From dea8eefd6214e3ad5b54795fa958ab721d58710c Mon Sep 17 00:00:00 2001 From: "kaf24@scramble.cl.cam.ac.uk" Date: Sat, 31 Jan 2004 19:45:13 +0000 Subject: [PATCH] bitkeeper revision 1.699 (401c05c9TV2zsaZ_e3zpy-zaKxCetw) timer.c, timer.h, sched.h: new file Many files: Rolf's new timer interface, plus various cleanups. --- .rootkeys | 3 + docs/interface.tex | 69 +- extras/mini-os/h/hypervisor.h | 34 +- extras/mini-os/time.c | 40 +- tools/misc/Makefile | 4 +- tools/misc/xen_read_console.c | 11 +- tools/xc/lib/Makefile | 8 +- xen/arch/i386/entry.S | 1 + xen/arch/i386/time.c | 6 - xen/common/ac_timer.c | 6 - xen/common/keyhandler.c | 44 +- xen/common/schedule.c | 262 +++-- xen/drivers/block/xen_vbd.c | 2 +- xen/include/hypervisor-ifs/hypervisor-if.h | 32 +- xen/include/xeno/sched.h | 40 +- xen/net/dev.c | 11 + xenolinux-2.4.24-sparse/arch/xeno/config.in | 4 +- xenolinux-2.4.24-sparse/arch/xeno/defconfig | 5 +- .../arch/xeno/drivers/network/network.c | 13 +- .../arch/xeno/kernel/process.c | 30 +- .../arch/xeno/kernel/time.c | 155 ++- .../include/asm-xeno/hypervisor.h | 24 + xenolinux-2.4.24-sparse/include/linux/sched.h | 966 +++++++++++++++++ xenolinux-2.4.24-sparse/include/linux/timer.h | 77 ++ xenolinux-2.4.24-sparse/kernel/panic.c | 3 +- xenolinux-2.4.24-sparse/kernel/timer.c | 968 ++++++++++++++++++ 26 files changed, 2588 insertions(+), 230 deletions(-) create mode 100644 xenolinux-2.4.24-sparse/include/linux/sched.h create mode 100644 xenolinux-2.4.24-sparse/include/linux/timer.h create mode 100644 xenolinux-2.4.24-sparse/kernel/timer.c diff --git a/.rootkeys b/.rootkeys index 65a76aac8b..312d4df82a 100644 --- a/.rootkeys +++ b/.rootkeys @@ -581,11 +581,14 @@ 3f689063nhrIRsMMZjZxMFk7iEINqQ xenolinux-2.4.24-sparse/include/asm-xeno/xeno_proc.h 3f056927gMHl7mWB89rb73JahbhQIA xenolinux-2.4.24-sparse/include/linux/blk.h 3e5a4e68WLX3B8owTvktP3HHOtznPQ xenolinux-2.4.24-sparse/include/linux/major.h +401c0590D_kwJDU59X8NyvqSv_Cl2A 
xenolinux-2.4.24-sparse/include/linux/sched.h 3e5a4e686V0nioX2ZpFf056sgvdiQw xenolinux-2.4.24-sparse/include/linux/sunrpc/debug.h +401c0592pLrp_aCbQRo9GXiYQQaVVA xenolinux-2.4.24-sparse/include/linux/timer.h 3e5a4e68W_hpMlM3u_-QOKMp3gzcwQ xenolinux-2.4.24-sparse/init/do_mounts.c 3e5a4e68TJJavrunYwTAnLRSBxSYqQ xenolinux-2.4.24-sparse/kernel/panic.c 3f1056a9LXNTgSzITNh1mb-MIKV1Ng xenolinux-2.4.24-sparse/kernel/printk.c 3f9d4b44247udoqWEgFkaHiWv6Uvyg xenolinux-2.4.24-sparse/kernel/time.c +401c059bjLBFYHRD4Py2uM3eA1D4zQ xenolinux-2.4.24-sparse/kernel/timer.c 3eba8f878XjouY21EkQBXwYBsPsipQ xenolinux-2.4.24-sparse/lndir-rel 3e6e7c1efbQe93xCvOpOVCnXTMmQ5w xenolinux-2.4.24-sparse/mkbuildtree 3e5a4e68GxCIaFH4sy01v1wjapetaA xenolinux-2.4.24-sparse/mm/memory.c diff --git a/docs/interface.tex b/docs/interface.tex index ac942658db..1c4ca937d7 100644 --- a/docs/interface.tex +++ b/docs/interface.tex @@ -117,18 +117,20 @@ time. \section{Cycle counter time} -This provides the finest-grained, free-running time reference, with the approximate -frequency being publicly accessible. The cycle counter time is used to accurately -extrapolate the other time references. On SMP machines it is currently assumed -that the cycle counter time is synchronised between CPUs. The current x86-based -implementation achieves this within inter-CPU communication latencies. +This provides the finest-grained, free-running time reference, with the +approximate frequency being publicly accessible. The cycle counter time is +used to accurately extrapolate the other time references. On SMP machines +it is currently assumed that the cycle counter time is synchronised between +CPUs. The current x86-based implementation achieves this within inter-CPU +communication latencies. \section{System time} -This is a 64-bit value containing the nanoseconds elapsed since boot time. Unlike -cycle counter time, system time accurately reflects the passage of real time, i.e. 
-it is adjusted several times a second for timer drift. This is done by running an -NTP client in {\it domain0} on behalf of the machine, feeding updates to the -hypervisor. Intermediate values can be extrapolated using the cycle counter. +This is a 64-bit value containing the nanoseconds elapsed since boot +time. Unlike cycle counter time, system time accurately reflects the +passage of real time, i.e. it is adjusted several times a second for timer +drift. This is done by running an NTP client in {\it domain0} on behalf of +the machine, feeding updates to the hypervisor. Intermediate values can be +extrapolated using the cycle counter. \section{Wall clock time} This is the actual ``time of day'' Unix style struct timeval (i.e. seconds and @@ -140,10 +142,39 @@ and remain perfectly in time. \section{Domain virtual time} -This progresses at the same pace as cycle counter time, but only while a domain -is executing. It stops while a domain is de-scheduled. Therefore the share of the -CPU that a domain receives is indicated by the rate at which its domain virtual -time increases, relative to the rate at which cycle counter time does so. +This progresses at the same pace as cycle counter time, but only while a +domain is executing. It stops while a domain is de-scheduled. Therefore the +share of the CPU that a domain receives is indicated by the rate at which +its domain virtual time increases, relative to the rate at which cycle +counter time does so. + +\section{Time interface} +Xen exports some timestamps to guest operating systems through their shared +info page. Timestamps are provided for system time and wall-clock time. Xen +also provides the cycle counter values at the time of the last update +allowing guests to calculate the current values. The cpu frequency and a +scaling factor are provided for guests to convert cycle counter values to +real time. 
Since all time stamps need to be updated and read +\emph{atomically}, two version numbers are also stored in the shared info +page. + +Xen will ensure that the time stamps are updated frequently enough to avoid +an overflow of the cycle counter values. A guest can check if its notion of +time is up-to-date by comparing the version numbers. + +\section{Timer events} + +Xen maintains a periodic timer (currently with a 10ms period) which sends a +timer event to the currently executing domain. This allows Guest OSes to +keep track of the passing of time when executing. The scheduler also +arranges for a newly activated domain to receive a timer event when +scheduled so that the Guest OS can adjust to the passage of time while it +has been inactive. + +In addition, Xen exports a hypercall interface to each domain which allows +them to request a timer event to be sent to them at the specified system +time. Guest OSes may use this timer to implement timeout values when they +block. \chapter{Memory} @@ -371,7 +402,15 @@ Notify hypervisor of updates to transmit and/or receive descriptor rings. Notify hypervisor that fpu registers needed to be save on context switch. \section{ sched\_op(unsigned long op)} -Request scheduling operation from hypervisor. The options are: yield, stop, and exit. +Request scheduling operation from hypervisor. The options are: {\it yield}, +{\it block}, {\it stop}, and {\it exit}. {\it yield} keeps the calling +domain run-able but may cause a reschedule if other domains are +run-able. {\it block} removes the calling domain from the run queue and the +domain sleeps until an event is delivered to it. {\it stop} and {\it exit} +should be self-explanatory. + +\section{ set\_dom\_timer(dom\_timer\_arg\_t *timer\_arg)} +Request a timer event to be sent at the specified system time. \section{ dom0\_op(dom0\_op\_t *op)} Administrative domain operations for domain management. 
The options are: diff --git a/extras/mini-os/h/hypervisor.h b/extras/mini-os/h/hypervisor.h index a4f5625692..92bb37cdd2 100644 --- a/extras/mini-os/h/hypervisor.h +++ b/extras/mini-os/h/hypervisor.h @@ -1,3 +1,10 @@ +/****************************************************************************** + * hypervisor.h + * + * Linux-specific hypervisor handling. + * + * Copyright (c) 2002, K A Fraser + */ #ifndef _HYPERVISOR_H_ #define _HYPERVISOR_H_ @@ -135,6 +142,17 @@ static __inline__ int HYPERVISOR_yield(void) return ret; } +static __inline__ int HYPERVISOR_block(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_block) ); + + return ret; +} + static __inline__ int HYPERVISOR_exit(void) { int ret; @@ -146,13 +164,25 @@ static __inline__ int HYPERVISOR_exit(void) return ret; } -static __inline__ int HYPERVISOR_stop(void) +static __inline__ int HYPERVISOR_stop(unsigned long srec) { int ret; + /* NB. On suspend, control software expects a suspend record in %esi. 
*/ __asm__ __volatile__ ( TRAP_INSTR : "=a" (ret) : "0" (__HYPERVISOR_sched_op), - "b" (SCHEDOP_stop) ); + "b" (SCHEDOP_stop), "S" (srec) : "memory" ); + + return ret; +} + +static __inline__ long HYPERVISOR_set_dom_timer(void *timer_arg) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_dom_timer), + "b" (timer_arg) : "memory" ); return ret; } diff --git a/extras/mini-os/time.c b/extras/mini-os/time.c index 447e164987..12356b0a03 100644 --- a/extras/mini-os/time.c +++ b/extras/mini-os/time.c @@ -1,20 +1,14 @@ /* -*- Mode:C; c-basic-offset:4; tab-width:4 -*- **************************************************************************** * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge + * (C) 2002-2003 - Keir Fraser - University of Cambridge **************************************************************************** * * File: time.c - * Author: Rolf Neugebauer (neugebar@dcs.gla.ac.uk) - * Changes: - * - * Date: Jul 2003 - * - * Environment: Xen Minimal OS + * Author: Rolf Neugebauer and Keir Fraser + * * Description: Simple time and timer functions * - **************************************************************************** - * $Id: c-insert.c,v 1.7 2002/11/08 16:04:34 rn Exp $ - **************************************************************************** * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the @@ -105,6 +99,29 @@ static __inline__ unsigned long get_time_delta_usecs(void) return (unsigned long)delta; } +s64 get_s_time (void) +{ + u64 u_delta; + s64 ret; + + again: + + u_delta = get_time_delta_usecs(); + ret = shadow_system_time + (1000 * u_delta); + + if ( unlikely(!TIME_VALUES_UP_TO_DATE) ) + { + /* + * We may have blocked for a long time, rendering our calculations + * invalid (e.g. the time delta may have overflowed). 
Detect that + * and recalculate with fresh values. + */ + get_time_values_from_xen(); + goto again; + } + + return ret; +} void gettimeofday(struct timeval *tv) { @@ -123,11 +140,16 @@ void gettimeofday(struct timeval *tv) } +/* + * Just a dummy + */ static void timer_handler(int ev, struct pt_regs *regs) { static int i; struct timeval tv; + get_time_values_from_xen(); + i++; if (i >= 1000) { gettimeofday(&tv); diff --git a/tools/misc/Makefile b/tools/misc/Makefile index 19d25b4748..597e68da17 100644 --- a/tools/misc/Makefile +++ b/tools/misc/Makefile @@ -16,7 +16,7 @@ all: $(TARGETS) install: all mkdir -p /usr/bin - cp -a $(INSTALL) /usr/bin + cp $(INSTALL) /usr/bin chmod 755 /usr/bin/xen-mkdevnodes chmod 755 /usr/bin/xen_nat_enable chmod 755 /usr/bin/xen-clone @@ -24,7 +24,7 @@ install: all dist: all mkdir -p ../../../install/bin - cp -a $(INSTALL) ../../../install/bin + cp $(INSTALL) ../../../install/bin chmod 755 ../../../install/bin/xen-mkdevnodes chmod 755 ../../../install/bin/xen_nat_enable chmod 755 ../../../install/bin/xen-clone diff --git a/tools/misc/xen_read_console.c b/tools/misc/xen_read_console.c index 766d24f6f3..1352de8a6f 100644 --- a/tools/misc/xen_read_console.c +++ b/tools/misc/xen_read_console.c @@ -11,9 +11,9 @@ int main(void) { - unsigned char buf[208]; + unsigned char buf[208], filtered[208]; struct sockaddr_in addr, from; - int fromlen = sizeof(from); + int fromlen = sizeof(from), i, j; int len, fd = socket(PF_INET, SOCK_DGRAM, 0); if ( fd < 0 ) @@ -46,7 +46,12 @@ int main(void) if ( buf[len-1] != '\n' ) { buf[len] = '\n'; len++; } buf[len] = '\0'; - printf("[%d] %s", ntohs(from.sin_port),buf); + for ( i = 0, j = 0; i < len; i++ ) + if ( (buf[i] == '\n') || (buf[i] == '\0') || + ((buf[i] >= 32) && (buf[i] <= 126)) ) + filtered[j++] = buf[i]; + + printf("[%d] %s", ntohs(from.sin_port), filtered); fromlen = sizeof(from); } diff --git a/tools/xc/lib/Makefile b/tools/xc/lib/Makefile index 2693372048..188478cd25 100644 --- 
a/tools/xc/lib/Makefile +++ b/tools/xc/lib/Makefile @@ -21,17 +21,17 @@ check-for-zlib: install: all mkdir -p /usr/lib mkdir -p /usr/include - cp -a $(LIB) /usr/lib + cp $(LIB) /usr/lib chmod 755 /usr/lib/$(LIB) - cp -a xc.h /usr/include + cp xc.h /usr/include chmod 644 /usr/include/xc.h dist: all mkdir -p ../../../../install/lib mkdir -p ../../../../install/include - cp -a $(LIB) ../../../../install/lib + cp $(LIB) ../../../../install/lib chmod 755 ../../../../install/lib/$(LIB) - cp -a xc.h ../../../../install/include + cp xc.h ../../../../install/include chmod 644 ../../../../install/include/xc.h clean: diff --git a/xen/arch/i386/entry.S b/xen/arch/i386/entry.S index 08824dd920..4a135e5212 100644 --- a/xen/arch/i386/entry.S +++ b/xen/arch/i386/entry.S @@ -713,6 +713,7 @@ ENTRY(hypervisor_call_table) .long SYMBOL_NAME(do_net_io_op) .long SYMBOL_NAME(do_fpu_taskswitch) .long SYMBOL_NAME(do_sched_op) + .long SYMBOL_NAME(do_set_timer_op) .long SYMBOL_NAME(do_dom0_op) .long SYMBOL_NAME(do_network_op) .long SYMBOL_NAME(do_block_io_op) diff --git a/xen/arch/i386/time.c b/xen/arch/i386/time.c index 1328f31fd4..8d328c34b6 100644 --- a/xen/arch/i386/time.c +++ b/xen/arch/i386/time.c @@ -37,12 +37,6 @@ #include #include -#ifdef TIME_TRACE -#define TRC(_x) _x -#else -#define TRC(_x) -#endif - extern rwlock_t xtime_lock; extern unsigned long wall_jiffies; diff --git a/xen/common/ac_timer.c b/xen/common/ac_timer.c index c3d2f51b37..4da998d45d 100644 --- a/xen/common/ac_timer.c +++ b/xen/common/ac_timer.c @@ -27,12 +27,6 @@ #include #include -#ifdef AC_TIMER_TRACE -#define TRC(_x) _x -#else -#define TRC(_x) -#endif - /* * We pull handlers off the timer list this far in future, * rather than reprogramming the time hardware. 
diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c index c03aa908ab..3c92eb976f 100644 --- a/xen/common/keyhandler.c +++ b/xen/common/keyhandler.c @@ -1,5 +1,7 @@ + #include #include +#include #define KEY_MAX 256 #define STR_MAX 64 @@ -80,40 +82,48 @@ static void kill_dom0(u_char key, void *dev_id, struct pt_regs *regs) /* XXX SMH: this is keir's fault */ static char *task_states[] = { - "Runnable", - "Interruptible Sleep", - "Uninterruptible Sleep", - NULL, "Stopped", - NULL, NULL, NULL, "Dying", + "Runnable ", + "Int Sleep ", + "UInt Sleep", + NULL, + "Stopped ", + NULL, + NULL, + NULL, + "Dying ", }; void do_task_queues(u_char key, void *dev_id, struct pt_regs *regs) { - unsigned long flags; + unsigned long flags, cpu_mask = 0; struct task_struct *p; shared_info_t *s; + s_time_t now = NOW(); - printk("'%c' pressed -> dumping task queues\n", key); + printk("'%c' pressed -> dumping task queues (now=0x%X:%08X)\n", key, + (u32)(now>>32), (u32)now); read_lock_irqsave(&tasklist_lock, flags); p = &idle0_task; do { printk("Xen: DOM %d, CPU %d [has=%c], state = %s, " - "hyp_events = %08x\n", - p->domain, p->processor, p->has_cpu ? 'T':'F', - task_states[p->state], p->hyp_events); - s = p->shared_info; - if( !is_idle_task(p) ) + "hyp_events = %08x\n", + p->domain, p->processor, p->has_cpu ? 
'T':'F', + task_states[p->state], p->hyp_events); + s = p->shared_info; + if( !is_idle_task(p) ) { - printk("Guest: events = %08lx, events_mask = %08lx\n", - s->events, s->events_mask); - printk("Notifying guest...\n"); - set_bit(_EVENT_DEBUG, &s->events); - } + printk("Guest: events = %08lx, events_mask = %08lx\n", + s->events, s->events_mask); + printk("Notifying guest...\n"); + cpu_mask |= mark_guest_event(p, _EVENT_DEBUG); + } } while ( (p = p->next_task) != &idle0_task ); read_unlock_irqrestore(&tasklist_lock, flags); + + guest_event_notify(cpu_mask); } extern void perfc_printall (u_char key, void *dev_id, struct pt_regs *regs); diff --git a/xen/common/schedule.c b/xen/common/schedule.c index 2b834d93e3..5352bbb6a6 100644 --- a/xen/common/schedule.c +++ b/xen/common/schedule.c @@ -5,7 +5,7 @@ **************************************************************************** * * File: common/schedule.c - * Author: Rolf Neugebar & Keir Fraser + * Author: Rolf Neugebauer & Keir Fraser * * Description: CPU scheduling * implements A Borrowed Virtual Time scheduler. @@ -24,16 +24,13 @@ #include #include -#undef SCHEDULER_TRACE -#ifdef SCHEDULER_TRACE -#define TRC(_x) _x -#else -#define TRC(_x) -#endif +/*#define WAKEUP_HISTO*/ +/*#define BLOCKTIME_HISTO*/ -/*#define SCHED_HISTO*/ -#ifdef SCHED_HISTO +#if defined(WAKEUP_HISTO) #define BUCKETS 31 +#elif defined(BLOCKTIME_HISTO) +#define BUCKETS 200 #endif #define MCU (s32)MICROSECS(100) /* Minimum unit */ @@ -48,7 +45,7 @@ typedef struct schedule_data_st struct task_struct *idle; /* idle task for this cpu */ u32 svt; /* system virtual time. per CPU??? */ struct ac_timer s_timer; /* scheduling timer */ -#ifdef SCHED_HISTO +#ifdef BUCKETS u32 hist[BUCKETS]; /* for scheduler latency histogram */ #endif } __cacheline_aligned schedule_data_t; @@ -56,19 +53,25 @@ static schedule_data_t schedule_data[NR_CPUS]; spinlock_t schedule_lock[NR_CPUS] __cacheline_aligned; -/* Skanky periodic event to all guests. 
This must die in the next release! */ -static struct ac_timer v_timer; +/* Per-CPU periodic timer sends an event to the currently-executing domain. */ +static struct ac_timer t_timer[NR_CPUS]; /* - * Per-CPU timer to ensure that even guests with very long quantums get + * Per-CPU timer which ensures that even guests with very long quantums get * their time-of-day state updated often enough to avoid wrapping. */ static struct ac_timer fallback_timer[NR_CPUS]; -static void virt_timer(unsigned long foo); -static void dump_rqueue(struct list_head *queue, char *name); - +/* Various timer handlers. */ +static void s_timer_fn(unsigned long unused); +static void t_timer_fn(unsigned long unused); +static void dom_timer_fn(unsigned long data); +static void fallback_timer_fn(unsigned long unused); +/* + * Wrappers for run-queue management. Must be called with the schedule_lock + * held. + */ static inline void __add_to_runqueue_head(struct task_struct * p) { list_add(&p->run_list, &schedule_data[p->processor].runqueue); @@ -93,6 +96,10 @@ static inline int __task_on_runqueue(struct task_struct *p) #define next_domain(p) \\ list_entry((p)->run_list.next, struct task_struct, run_list) +/* + * Calculate the effective virtual time for a domain. Take into account + * warping limits + */ static void __calc_evt(struct task_struct *p) { s_time_t now = NOW(); @@ -134,14 +141,21 @@ void sched_add_domain(struct task_struct *p) } else { - /* set avt end evt to system virtual time */ + /* Set avt end evt to system virtual time. */ p->avt = schedule_data[p->processor].svt; p->evt = schedule_data[p->processor].svt; - /* set some default values here */ + /* Set some default values here. */ p->warpback = 0; p->warp = 0; p->warpl = 0; p->warpu = 0; + + /* Initialise the per-domain timer. 
*/ + init_ac_timer(&p->timer); + p->timer.cpu = p->processor; + p->timer.data = (unsigned long)p; + p->timer.function = &dom_timer_fn; + } } @@ -187,7 +201,7 @@ void __wake_up(struct task_struct *p) p->warped = NOW(); __calc_evt(p); -#ifdef SCHED_HISTO +#ifdef WAKEUP_HISTO p->wokenup = NOW(); #endif } @@ -200,16 +214,31 @@ void wake_up(struct task_struct *p) spin_unlock_irqrestore(&schedule_lock[p->processor], flags); } -/* Voluntarily yield the processor to another domain, until an event occurs. */ -long do_yield(void) +/* + * Block the currently-executing domain until a pertinent event occurs. + */ +static long do_block(void) { + set_bit(EVENTS_MASTER_ENABLE_BIT, &current->shared_info->events_mask); current->state = TASK_INTERRUPTIBLE; - current->warpback = 0; /* XXX should only do this when blocking */ + current->warpback = 0; __enter_scheduler(); return 0; } -/* Demultiplex scheduler-related hypercalls. */ +/* + * Voluntarily yield the processor for this allocation. + */ +static long do_yield(void) +{ + __enter_scheduler(); + return 0; +} + + +/* + * Demultiplex scheduler-related hypercalls. + */ long do_sched_op(unsigned long op) { long ret = 0; @@ -223,14 +252,24 @@ long do_sched_op(unsigned long op) break; } + case SCHEDOP_block: + { + ret = do_block(); + break; + } + case SCHEDOP_exit: { + DPRINTK("DOM%d killed itself!\n", current->domain); + DPRINTK(" EIP == %08lx\n", get_execution_context()->eip); kill_domain(); break; } case SCHEDOP_stop: { + DPRINTK("DOM%d stopped itself!\n", current->domain); + DPRINTK(" EIP == %08lx\n", get_execution_context()->eip); stop_domain(); break; } @@ -242,6 +281,23 @@ long do_sched_op(unsigned long op) return ret; } +/* Per-domain one-shot-timer hypercall. 
*/ +long do_set_timer_op(unsigned long timeout_hi, unsigned long timeout_lo) +{ + struct task_struct *p = current; + + rem_ac_timer(&p->timer); + + if ( (timeout_hi != 0) || (timeout_lo != 0) ) + { + p->timer.expires = ((s_time_t)timeout_hi<<32) | ((s_time_t)timeout_lo); + add_ac_timer(&p->timer); + } + + return 0; +} + + /* Control the scheduler. */ long sched_bvtctl(unsigned long c_allow) { @@ -330,7 +386,7 @@ asmlinkage void __enter_scheduler(void) { struct task_struct *prev = current, *next = NULL, *next_prime, *p; struct list_head *tmp; - int this_cpu = prev->processor; + int cpu = prev->processor; s_time_t now; s32 r_time; /* time for new dom to run */ s32 ranfor; /* assume we never run longer than 2.1s! */ @@ -339,11 +395,11 @@ asmlinkage void __enter_scheduler(void) perfc_incrc(sched_run); - spin_lock_irq(&schedule_lock[this_cpu]); + spin_lock_irq(&schedule_lock[cpu]); now = NOW(); - rem_ac_timer(&schedule_data[this_cpu].s_timer); + rem_ac_timer(&schedule_data[cpu].s_timer); ASSERT(!in_interrupt()); ASSERT(__task_on_runqueue(prev)); @@ -374,21 +430,21 @@ asmlinkage void __enter_scheduler(void) clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events); /* We should at least have the idle task */ - ASSERT(!list_empty(&schedule_data[this_cpu].runqueue)); + ASSERT(!list_empty(&schedule_data[cpu].runqueue)); /* * scan through the run queue and pick the task with the lowest evt * *and* the task the second lowest evt. * this code is O(n) but we expect n to be small. */ - next = schedule_data[this_cpu].idle; + next = schedule_data[cpu].idle; next_prime = NULL; next_evt = ~0U; next_prime_evt = ~0U; min_avt = ~0U; - list_for_each ( tmp, &schedule_data[this_cpu].runqueue ) + list_for_each ( tmp, &schedule_data[cpu].runqueue ) { p = list_entry(tmp, struct task_struct, run_list); if ( p->evt < next_evt ) @@ -416,16 +472,16 @@ asmlinkage void __enter_scheduler(void) /* Update system virtual time. 
*/ if ( min_avt != ~0U ) - schedule_data[this_cpu].svt = min_avt; + schedule_data[cpu].svt = min_avt; /* check for virtual time overrun on this cpu */ - if ( schedule_data[this_cpu].svt >= 0xf0000000 ) + if ( schedule_data[cpu].svt >= 0xf0000000 ) { u_long t_flags; write_lock_irqsave(&tasklist_lock, t_flags); p = &idle0_task; do { - if ( (p->processor == this_cpu) && !is_idle_task(p) ) + if ( (p->processor == cpu) && !is_idle_task(p) ) { p->evt -= 0xe0000000; p->avt -= 0xe0000000; @@ -433,7 +489,7 @@ asmlinkage void __enter_scheduler(void) } while ( (p = p->next_task) != &idle0_task ); write_unlock_irqrestore(&tasklist_lock, t_flags); - schedule_data[this_cpu].svt -= 0xe0000000; + schedule_data[cpu].svt -= 0xe0000000; } /* work out time for next run through scheduler */ @@ -461,46 +517,43 @@ asmlinkage void __enter_scheduler(void) sched_done: ASSERT(r_time >= ctx_allow); -#ifndef NDEBUG - if ( r_time < ctx_allow ) - { - printk("[%02d]: %lx\n", this_cpu, (unsigned long)r_time); - dump_rqueue(&schedule_data[this_cpu].runqueue, "foo"); - } -#endif - prev->has_cpu = 0; next->has_cpu = 1; - schedule_data[this_cpu].curr = next; + schedule_data[cpu].curr = next; next->lastschd = now; /* reprogramm the timer */ - schedule_data[this_cpu].s_timer.expires = now + r_time; - add_ac_timer(&schedule_data[this_cpu].s_timer); + schedule_data[cpu].s_timer.expires = now + r_time; + add_ac_timer(&schedule_data[cpu].s_timer); + + spin_unlock_irq(&schedule_lock[cpu]); - spin_unlock_irq(&schedule_lock[this_cpu]); + /* Ensure that the domain has an up-to-date time base. 
*/ + if ( !is_idle_task(next) ) + update_dom_time(next->shared_info); - /* done, switch tasks */ if ( unlikely(prev == next) ) - { - /* We won't go through the normal tail, so do this by hand */ - update_dom_time(prev->shared_info); return; - } perfc_incrc(sched_ctx); -#ifdef SCHED_HISTO + +#if defined(WAKEUP_HISTO) + if ( !is_idle_task(next) && next->wokenup ) { + ulong diff = (ulong)(now - next->wokenup); + diff /= (ulong)MILLISECS(1); + if (diff <= BUCKETS-2) schedule_data[cpu].hist[diff]++; + else schedule_data[cpu].hist[BUCKETS-1]++; + } + next->wokenup = (s_time_t)0; +#elif defined(BLOCKTIME_HISTO) + prev->lastdeschd = now; + if ( !is_idle_task(next) ) { - ulong diff; /* should fit in 32bits */ - if (!is_idle_task(next) && next->wokenup) { - diff = (ulong)(now - next->wokenup); - diff /= (ulong)MILLISECS(1); - if (diff <= BUCKETS-2) schedule_data[this_cpu].hist[diff]++; - else schedule_data[this_cpu].hist[BUCKETS-1]++; - } - next->wokenup = (s_time_t)0; + ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10)); + if (diff <= BUCKETS-2) schedule_data[cpu].hist[diff]++; + else schedule_data[cpu].hist[BUCKETS-1]++; } #endif @@ -509,8 +562,10 @@ asmlinkage void __enter_scheduler(void) if ( unlikely(prev->state == TASK_DYING) ) put_task_struct(prev); - update_dom_time(next->shared_info); - + /* Mark a timer event for the newly-scheduled domain. */ + if ( !is_idle_task(next) ) + set_bit(_EVENT_TIMER, &next->shared_info->events); + schedule_tail(next); BUG(); @@ -524,55 +579,57 @@ int idle_cpu(int cpu) } -/* The scheduler timer. 
*/ -static void sched_timer(unsigned long unused) +/**************************************************************************** + * Timers: the scheduler utilises a number of timers + * - s_timer: per CPU timer for preemption and scheduling decisions + * - t_timer: per CPU periodic timer to send timer interrupt to current dom + * - dom_timer: per domain timer to specify timeout values + * - fallback_timer: safeguard to ensure time is up to date + ****************************************************************************/ + +/* The scheduler timer: force a run through the scheduler*/ +static void s_timer_fn(unsigned long unused) { - int cpu = smp_processor_id(); - struct task_struct *curr = schedule_data[cpu].curr; - /* cause a reschedule */ - set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events); + set_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events); perfc_incrc(sched_irq); } -/* The Domain virtual time timer */ -static void virt_timer(unsigned long unused) +/* Periodic tick timer: send timer event to current domain*/ +static void t_timer_fn(unsigned long unused) { - unsigned long flags, cpu_mask = 0; - struct task_struct *p; - s_time_t now; + struct task_struct *p = current; - /* send virtual timer interrupt */ - read_lock_irqsave(&tasklist_lock, flags); - p = &idle0_task; - do { - if ( is_idle_task(p) ) continue; - cpu_mask |= mark_guest_event(p, _EVENT_TIMER); - } - while ( (p = p->next_task) != &idle0_task ); - read_unlock_irqrestore(&tasklist_lock, flags); - guest_event_notify(cpu_mask); + if ( !is_idle_task(p) ) + set_bit(_EVENT_TIMER, &p->shared_info->events); - now = NOW(); - v_timer.expires = now + MILLISECS(20); - add_ac_timer(&v_timer); + t_timer[p->processor].expires = NOW() + MILLISECS(10); + add_ac_timer(&t_timer[p->processor]); +} + +/* Domain timer function, sends a virtual timer interrupt to domain */ +static void dom_timer_fn(unsigned long data) +{ + unsigned long cpu_mask = 0; + struct task_struct *p = (struct task_struct *)data; + + cpu_mask |= 
mark_guest_event(p, _EVENT_TIMER); + guest_event_notify(cpu_mask); } + /* Fallback timer to ensure guests get time updated 'often enough'. */ static void fallback_timer_fn(unsigned long unused) { struct task_struct *p = current; - unsigned int cpu = p->processor; if ( !is_idle_task(p) ) update_dom_time(p->shared_info); - fallback_timer[cpu].expires = NOW() + MILLISECS(500); - add_ac_timer(&fallback_timer[cpu]); + fallback_timer[p->processor].expires = NOW() + MILLISECS(500); + add_ac_timer(&fallback_timer[p->processor]); } -/* - * Initialise the data structures - */ +/* Initialise the data structures. */ void __init scheduler_init(void) { int i; @@ -588,20 +645,20 @@ void __init scheduler_init(void) init_ac_timer(&schedule_data[i].s_timer); schedule_data[i].s_timer.cpu = i; schedule_data[i].s_timer.data = 2; - schedule_data[i].s_timer.function = &sched_timer; + schedule_data[i].s_timer.function = &s_timer_fn; + + init_ac_timer(&t_timer[i]); + t_timer[i].cpu = i; + t_timer[i].data = 3; + t_timer[i].function = &t_timer_fn; init_ac_timer(&fallback_timer[i]); fallback_timer[i].cpu = i; - fallback_timer[i].data = 0; + fallback_timer[i].data = 4; fallback_timer[i].function = &fallback_timer_fn; } schedule_data[0].idle = &idle0_task; - - init_ac_timer(&v_timer); - v_timer.cpu = 0; - v_timer.data = 0; - v_timer.function = &virt_timer; } /* @@ -612,10 +669,11 @@ void schedulers_start(void) { printk("Start schedulers\n"); - virt_timer(0); + s_timer_fn(0); + smp_call_function((void *)s_timer_fn, NULL, 1, 1); - sched_timer(0); - smp_call_function((void *)sched_timer, NULL, 1, 1); + t_timer_fn(0); + smp_call_function((void *)t_timer_fn, NULL, 1, 1); fallback_timer_fn(0); smp_call_function((void *)fallback_timer_fn, NULL, 1, 1); @@ -668,7 +726,7 @@ void dump_runq(u_char key, void *dev_id, struct pt_regs *regs) return; } -#ifdef SCHED_HISTO +#if defined(WAKEUP_HISTO) || defined(BLOCKTIME_HISTO) void print_sched_histo(u_char key, void *dev_id, struct pt_regs *regs) { int loop, i, 
j; diff --git a/xen/drivers/block/xen_vbd.c b/xen/drivers/block/xen_vbd.c index 5570baff94..8a42026dac 100644 --- a/xen/drivers/block/xen_vbd.c +++ b/xen/drivers/block/xen_vbd.c @@ -89,7 +89,7 @@ long vbd_create(vbd_create_t *create) if ( unlikely((p = find_domain_by_id(create->domain)) == NULL) ) { DPRINTK("vbd_create attempted for non-existent domain %d\n", - domain); + create->domain); return -EINVAL; } diff --git a/xen/include/hypervisor-ifs/hypervisor-if.h b/xen/include/hypervisor-ifs/hypervisor-if.h index ef29e292cb..ac1ecfd6f8 100644 --- a/xen/include/hypervisor-ifs/hypervisor-if.h +++ b/xen/include/hypervisor-ifs/hypervisor-if.h @@ -49,18 +49,19 @@ #define __HYPERVISOR_net_io_op 6 #define __HYPERVISOR_fpu_taskswitch 7 #define __HYPERVISOR_sched_op 8 -#define __HYPERVISOR_dom0_op 9 -#define __HYPERVISOR_network_op 10 -#define __HYPERVISOR_block_io_op 11 -#define __HYPERVISOR_set_debugreg 12 -#define __HYPERVISOR_get_debugreg 13 -#define __HYPERVISOR_update_descriptor 14 -#define __HYPERVISOR_set_fast_trap 15 -#define __HYPERVISOR_dom_mem_op 16 -#define __HYPERVISOR_multicall 17 -#define __HYPERVISOR_kbd_op 18 -#define __HYPERVISOR_update_va_mapping 19 -#define __HYPERVISOR_event_channel_op 20 +#define __HYPERVISOR_set_dom_timer 9 +#define __HYPERVISOR_dom0_op 10 +#define __HYPERVISOR_network_op 11 +#define __HYPERVISOR_block_io_op 12 +#define __HYPERVISOR_set_debugreg 13 +#define __HYPERVISOR_get_debugreg 14 +#define __HYPERVISOR_update_descriptor 15 +#define __HYPERVISOR_set_fast_trap 16 +#define __HYPERVISOR_dom_mem_op 17 +#define __HYPERVISOR_multicall 18 +#define __HYPERVISOR_kbd_op 19 +#define __HYPERVISOR_update_va_mapping 20 +#define __HYPERVISOR_event_channel_op 21 /* And the trap vector is... */ #define TRAP_INSTR "int $0x82" @@ -161,9 +162,10 @@ /* * SCHEDOP_* - Scheduler hypercall operations. */ -#define SCHEDOP_yield 0 -#define SCHEDOP_exit 1 -#define SCHEDOP_stop 2 +#define SCHEDOP_yield 0 /* Give up the CPU voluntarily. 
*/ +#define SCHEDOP_block 1 /* Block until an event is received. */ +#define SCHEDOP_exit 3 /* Exit and kill this domain. */ +#define SCHEDOP_stop 4 /* Stop executing this domain. */ /* * EVTCHNOP_* - Event channel operations. diff --git a/xen/include/xeno/sched.h b/xen/include/xeno/sched.h index 2e9cd0c563..bf1850ca5a 100644 --- a/xen/include/xeno/sched.h +++ b/xen/include/xeno/sched.h @@ -100,25 +100,27 @@ struct task_struct unsigned int tot_pages; /* number of pages currently possesed */ unsigned int max_pages; /* max number of pages that can be possesed */ - /* scheduling */ + /* Scheduling. */ struct list_head run_list; int has_cpu; - int state; /* current run state */ - int cpupinned; /* true if pinned to curent CPU */ - - s_time_t lastschd; /* time this domain was last scheduled */ - s_time_t cpu_time; /* total CPU time received till now */ - s_time_t wokenup; /* time domain got woken up */ - + int state; /* current run state */ + int cpupinned; /* true if pinned to curent CPU */ + s_time_t lastschd; /* time this domain was last scheduled */ + s_time_t lastdeschd; /* time this domain was last descheduled */ + s_time_t cpu_time; /* total CPU time received till now */ + s_time_t wokenup; /* time domain got woken up */ + struct ac_timer timer; /* one-shot timer for timeout values */ + + /* BVT scheduler specific. */ unsigned long mcu_advance; /* inverse of weight */ - u32 avt; /* actual virtual time */ - u32 evt; /* effective virtual time */ - int warpback; /* warp? */ - long warp; /* virtual time warp */ - long warpl; /* warp limit */ - long warpu; /* unwarp time requirement */ - s_time_t warped; /* time it ran warped last time */ - s_time_t uwarped; /* time it ran unwarped last time */ + u32 avt; /* actual virtual time */ + u32 evt; /* effective virtual time */ + int warpback; /* warp? 
*/ + long warp; /* virtual time warp */ + long warpl; /* warp limit */ + long warpu; /* unwarp time requirement */ + s_time_t warped; /* time it ran warped last time */ + s_time_t uwarped; /* time it ran unwarped last time */ /* Network I/O */ net_vif_t *net_vif_list[MAX_DOMAIN_VIFS]; @@ -250,7 +252,6 @@ long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp, void init_idle_task(void); void __wake_up(struct task_struct *p); void wake_up(struct task_struct *p); -long do_yield(void); unsigned long __reschedule(struct task_struct *p); void reschedule(struct task_struct *p); @@ -271,8 +272,9 @@ static inline long schedule_timeout(long timeout) return 0; } -#define signal_pending(_p) ((_p)->hyp_events || \ - (_p)->shared_info->events) +#define signal_pending(_p) \ + ((_p)->hyp_events || \ + ((_p)->shared_info->events & (_p)->shared_info->events_mask)) void domain_init(void); diff --git a/xen/net/dev.c b/xen/net/dev.c index 936d40f04c..963a65fbfb 100644 --- a/xen/net/dev.c +++ b/xen/net/dev.c @@ -1972,6 +1972,16 @@ static int get_tx_bufs(net_vif_t *vif) } else if ( (target == VIF_PHYS) || IS_PRIV(p) ) { + /* + * XXX HACK XXX: Our wildcard rule for domain-0 incorrectly puts + * some 169.254.* (ie. link-local) packets on the wire unless we + * include this explicit test. 
:-( + */ + if ( (ntohs(*(unsigned short *)(g_data + 12)) == ETH_P_IP) && + ((ntohl(*(unsigned long *)(g_data + 26)) & 0xFFFF0000) == + 0xA9FE0000) ) + goto disallow_linklocal_packets; + stx = &vif->tx_shadow_ring[MASK_NET_TX_IDX(j)]; stx->id = tx.id; stx->size = tx.size; @@ -1990,6 +2000,7 @@ static int get_tx_bufs(net_vif_t *vif) } else { + disallow_linklocal_packets: make_tx_response(vif, tx.id, RING_STATUS_DROPPED); } diff --git a/xenolinux-2.4.24-sparse/arch/xeno/config.in b/xenolinux-2.4.24-sparse/arch/xeno/config.in index 445b574a71..3f4736fd1f 100644 --- a/xenolinux-2.4.24-sparse/arch/xeno/config.in +++ b/xenolinux-2.4.24-sparse/arch/xeno/config.in @@ -13,9 +13,11 @@ define_bool CONFIG_SBUS n define_bool CONFIG_UID16 y mainmenu_option next_comment -comment 'Privileged guest OS' +comment 'XenoLinux' bool 'Support for privileged operations (domain 0)' CONFIG_XENO_PRIV endmenu +# the IBM S/390 patch needs this. +define_bool CONFIG_NO_IDLE_HZ y mainmenu_option next_comment comment 'Code maturity level options' diff --git a/xenolinux-2.4.24-sparse/arch/xeno/defconfig b/xenolinux-2.4.24-sparse/arch/xeno/defconfig index abef573aa7..3ba185a19b 100644 --- a/xenolinux-2.4.24-sparse/arch/xeno/defconfig +++ b/xenolinux-2.4.24-sparse/arch/xeno/defconfig @@ -8,9 +8,12 @@ CONFIG_ISA=y CONFIG_UID16=y # -# Privileged guest OS +# XenoLinux Options # +# support for privileged domains CONFIG_XENO_PRIV=y +# On demand timer setting (taken from s390 patch set) +CONFIG_NO_IDLE_HZ=y # # Code maturity level options diff --git a/xenolinux-2.4.24-sparse/arch/xeno/drivers/network/network.c b/xenolinux-2.4.24-sparse/arch/xeno/drivers/network/network.c index ac557a3c11..075acdf5af 100644 --- a/xenolinux-2.4.24-sparse/arch/xeno/drivers/network/network.c +++ b/xenolinux-2.4.24-sparse/arch/xeno/drivers/network/network.c @@ -81,15 +81,15 @@ static void _dbg_network_int(struct net_device *dev) if ( np->state == STATE_CLOSED ) return; - printk(KERN_ALERT "tx_full = %d, tx_resp_cons = 0x%08x," 
- " tx_req_prod = 0x%08x, tx_resp_prod = 0x%08x," - " tx_event = 0x%08x, state=%d\n", + printk(KERN_ALERT "net: tx_full=%d, tx_resp_cons=0x%08x," + " tx_req_prod=0x%08x\nnet: tx_resp_prod=0x%08x," + " tx_event=0x%08x, state=%d\n", np->tx_full, np->tx_resp_cons, np->net_idx->tx_req_prod, np->net_idx->tx_resp_prod, np->net_idx->tx_event, test_bit(__LINK_STATE_XOFF, &dev->state)); - printk(KERN_ALERT "rx_resp_cons = 0x%08x," - " rx_req_prod = 0x%08x, rx_resp_prod = 0x%08x, rx_event = 0x%08x\n", + printk(KERN_ALERT "net: rx_resp_cons=0x%08x," + " rx_req_prod=0x%08x\nnet: rx_resp_prod=0x%08x, rx_event=0x%08x\n", np->rx_resp_cons, np->net_idx->rx_req_prod, np->net_idx->rx_resp_prod, np->net_idx->rx_event); } @@ -550,7 +550,8 @@ int __init init_module(void) goto fail; } - err = request_irq(_EVENT_DEBUG, dbg_network_int, 0, "debug", NULL); + err = request_irq(_EVENT_DEBUG, dbg_network_int, SA_SHIRQ, "net_dbg", + &dbg_network_int); if ( err ) printk(KERN_WARNING "Non-fatal error -- no debug interrupt\n"); diff --git a/xenolinux-2.4.24-sparse/arch/xeno/kernel/process.c b/xenolinux-2.4.24-sparse/arch/xeno/kernel/process.c index 3b17c7326c..ff64bccd4c 100644 --- a/xenolinux-2.4.24-sparse/arch/xeno/kernel/process.c +++ b/xenolinux-2.4.24-sparse/arch/xeno/kernel/process.c @@ -80,14 +80,36 @@ void enable_hlt(void) */ void cpu_idle (void) { - /* endless idle loop with no priority at all */ + extern int set_timeout_timer(void); + + /* Endless idle loop with no priority at all. */ init_idle(); current->nice = 20; current->counter = -100; - while (1) { - while (!current->need_resched) - HYPERVISOR_yield(); + for ( ; ; ) + { + while ( !current->need_resched ) + { + __cli(); + if ( current->need_resched ) + { + /* The race-free check for events failed. */ + __sti(); + break; + } + else if ( set_timeout_timer() == 0 ) + { + /* NB. Blocking reenable events in a race-free manner. */ + HYPERVISOR_block(); + } + else + { + /* No race here: yielding will get us the CPU again anyway. 
*/ + __sti(); + HYPERVISOR_yield(); + } + } schedule(); check_pgt_cache(); } diff --git a/xenolinux-2.4.24-sparse/arch/xeno/kernel/time.c b/xenolinux-2.4.24-sparse/arch/xeno/kernel/time.c index 1944e63c1c..bf43b6a99b 100644 --- a/xenolinux-2.4.24-sparse/arch/xeno/kernel/time.c +++ b/xenolinux-2.4.24-sparse/arch/xeno/kernel/time.c @@ -75,7 +75,7 @@ static u32 st_scale_i; /* convert ticks -> usecs */ /* These are peridically updated in shared_info, and then copied here. */ static u32 shadow_tsc_stamp; -static s64 shadow_system_time; +static u64 shadow_system_time; static u32 shadow_time_version; static struct timeval shadow_tv; @@ -91,9 +91,12 @@ static long last_update_to_rtc, last_update_to_xen; #endif /* Periodically take synchronised time base from Xen, if we need it. */ -static long last_update_from_xen; +static long last_update_from_xen; /* UTC seconds when last read Xen clock. */ -static u64 processed_system_time; +/* Keep track of last time we did processing/updating of jiffies and xtime. */ +static u64 processed_system_time; /* System time (ns) at last processing. */ + +#define NS_PER_TICK (1000000000ULL/HZ) #define HANDLE_USEC_UNDERFLOW(_tv) \ do { \ @@ -197,8 +200,11 @@ static int set_rtc_mmss(unsigned long nowtime) #endif -/* Must be called with the xtime_lock held for writing. */ -static void get_time_values_from_xen(void) +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. Must be called with the xtime_lock held for writing. + */ +static void __get_time_values_from_xen(void) { do { shadow_time_version = HYPERVISOR_shared_info->time_version2; @@ -216,7 +222,11 @@ static void get_time_values_from_xen(void) (shadow_time_version == HYPERVISOR_shared_info->time_version2) -static inline unsigned long get_time_delta_usecs(void) +/* + * Returns the system time elapsed, in us, since the current shadow_timestamp + * was calculated. Must be called with the xtime_lock held for reading. 
+ */ +static inline unsigned long __get_time_delta_usecs(void) { s32 delta_tsc; u32 low; @@ -234,6 +244,9 @@ static inline unsigned long get_time_delta_usecs(void) } +/* + * Returns the current time-of-day in UTC timeval format. + */ void do_gettimeofday(struct timeval *tv) { unsigned long flags, lost; @@ -242,7 +255,7 @@ void do_gettimeofday(struct timeval *tv) again: read_lock_irqsave(&xtime_lock, flags); - _tv.tv_usec = get_time_delta_usecs(); + _tv.tv_usec = __get_time_delta_usecs(); if ( (lost = (jiffies - wall_jiffies)) != 0 ) _tv.tv_usec += lost * (1000000 / HZ); _tv.tv_sec = xtime.tv_sec; @@ -257,7 +270,7 @@ void do_gettimeofday(struct timeval *tv) */ read_unlock_irqrestore(&xtime_lock, flags); write_lock_irqsave(&xtime_lock, flags); - get_time_values_from_xen(); + __get_time_values_from_xen(); write_unlock_irqrestore(&xtime_lock, flags); goto again; } @@ -276,6 +289,10 @@ void do_gettimeofday(struct timeval *tv) *tv = _tv; } + +/* + * Sets the current time-of-day based on passed-in UTC timeval parameter. + */ void do_settimeofday(struct timeval *tv) { struct timeval newtv; @@ -291,10 +308,10 @@ void do_settimeofday(struct timeval *tv) * be stale, so we can retry with fresh ones. */ again: - tv->tv_usec -= get_time_delta_usecs(); + tv->tv_usec -= __get_time_delta_usecs(); if ( unlikely(!TIME_VALUES_UP_TO_DATE) ) { - get_time_values_from_xen(); + __get_time_values_from_xen(); goto again; } @@ -334,6 +351,7 @@ void do_settimeofday(struct timeval *tv) } } + asmlinkage long sys_stime(int *tptr) { int value; @@ -353,14 +371,22 @@ asmlinkage long sys_stime(int *tptr) return 0; } -#define NS_PER_TICK (1000000000ULL/HZ) + +/* Convert jiffies to system time. Call with xtime_lock held for reading. 
*/ +static inline u64 __jiffies_to_st(unsigned long j) +{ + return processed_system_time + ((j - jiffies) * NS_PER_TICK); +} + + static inline void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { s64 delta; + unsigned long ticks = 0; long sec_diff; - get_time_values_from_xen(); + __get_time_values_from_xen(); if ( (delta = (s64)(shadow_system_time - processed_system_time)) < 0 ) { @@ -368,13 +394,24 @@ static inline void do_timer_interrupt(int irq, void *dev_id, return; } + /* Process elapsed jiffies since last call. */ while ( delta >= NS_PER_TICK ) { - do_timer(regs); + ticks++; delta -= NS_PER_TICK; processed_system_time += NS_PER_TICK; } - + + if ( ticks != 0 ) + { + do_timer_ticks(ticks); + + if ( user_mode(regs) ) + update_process_times_us(ticks, 0); + else + update_process_times_us(0, ticks); + } + /* * Take synchronised time from Xen once a minute if we're not * synchronised ourselves, and we haven't chosen to keep an independent @@ -446,6 +483,7 @@ static inline void do_timer_interrupt(int irq, void *dev_id, #endif } + static void timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { write_lock(&xtime_lock); @@ -463,6 +501,89 @@ static struct irqaction irq_timer = { NULL }; + +/* + * This function works out when the next timer function has to be + * executed (by looking at the timer list) and sets the Xen one-shot + * domain timer to the appropriate value. This is typically called in + * cpu_idle() before the domain blocks. + * + * The function returns a non-0 value on error conditions. + * + * It must be called with interrupts disabled. + */ +extern spinlock_t timerlist_lock; +int set_timeout_timer(void) +{ + struct timer_list *timer; + u64 alarm = 0; + int ret = 0; + + spin_lock(&timerlist_lock); + + /* + * This is safe against long blocking (since calculations are not based on + * TSC deltas). It is also safe against warped system time since + * suspend-resume is cooperative and we would first get locked out. 
It is + * safe against normal updates of jiffies since interrupts are off. + */ + if ( (timer = next_timer_event()) != NULL ) + alarm = __jiffies_to_st(timer->expires); + + /* Failure is pretty bad, but we'd best soldier on. */ + if ( HYPERVISOR_set_dom_timer(alarm) != 0 ) + ret = -1; + + spin_unlock(&timerlist_lock); + + return ret; +} + + +/* Time debugging. */ +static void dbg_time_int(int irq, void *dev_id, struct pt_regs *ptregs) +{ + unsigned long flags, j; + u64 s_now, j_st; + struct timeval s_tv, tv; + + struct timer_list *timer; + u64 t_st; + + read_lock_irqsave(&xtime_lock, flags); + s_tv.tv_sec = shadow_tv.tv_sec; + s_tv.tv_usec = shadow_tv.tv_usec; + s_now = shadow_system_time; + read_unlock_irqrestore(&xtime_lock, flags); + + do_gettimeofday(&tv); + + j = jiffies; + j_st = __jiffies_to_st(j); + + timer = next_timer_event(); + t_st = __jiffies_to_st(timer->expires); + + printk(KERN_ALERT "time: shadow_st=0x%X:%08X\n", + (u32)(s_now>>32), (u32)s_now); + printk(KERN_ALERT "time: wct=%lds %ldus shadow_wct=%lds %ldus\n", + tv.tv_sec, tv.tv_usec, s_tv.tv_sec, s_tv.tv_usec); + printk(KERN_ALERT "time: jiffies=%lu(0x%X:%08X) timeout=%lu(0x%X:%08X)\n", + jiffies,(u32)(j_st>>32), (u32)j_st, + timer->expires,(u32)(t_st>>32), (u32)t_st); + printk(KERN_ALERT "time: processed_system_time=0x%X:%08X\n", + (u32)(processed_system_time>>32), (u32)processed_system_time); +} + +static struct irqaction dbg_time = { + dbg_time_int, + SA_SHIRQ, + 0, + "timer_dbg", + &dbg_time_int, + NULL +}; + void __init time_init(void) { unsigned long long alarm; @@ -494,10 +615,12 @@ void __init time_init(void) st_scale_f = scale & 0xffffffff; st_scale_i = scale >> 32; - get_time_values_from_xen(); + __get_time_values_from_xen(); processed_system_time = shadow_system_time; - setup_irq(TIMER_IRQ, &irq_timer); + (void)setup_irq(TIMER_IRQ, &irq_timer); + + (void)setup_irq(_EVENT_DEBUG, &dbg_time); rdtscll(alarm); diff --git a/xenolinux-2.4.24-sparse/include/asm-xeno/hypervisor.h 
b/xenolinux-2.4.24-sparse/include/asm-xeno/hypervisor.h index 064088ff6f..34272a624f 100644 --- a/xenolinux-2.4.24-sparse/include/asm-xeno/hypervisor.h +++ b/xenolinux-2.4.24-sparse/include/asm-xeno/hypervisor.h @@ -256,6 +256,17 @@ static inline int HYPERVISOR_yield(void) return ret; } +static inline int HYPERVISOR_block(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_block) ); + + return ret; +} + static inline int HYPERVISOR_exit(void) { int ret; @@ -279,6 +290,19 @@ static inline int HYPERVISOR_stop(unsigned long srec) return ret; } +static inline long HYPERVISOR_set_dom_timer(u64 timeout) +{ + int ret; + unsigned long timeout_hi = (unsigned long)(timeout>>32); + unsigned long timeout_lo = (unsigned long)timeout; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_dom_timer), + "b" (timeout_hi), "c" (timeout_lo) : "memory" ); + + return ret; +} + static inline int HYPERVISOR_dom0_op(dom0_op_t *dom0_op) { int ret; diff --git a/xenolinux-2.4.24-sparse/include/linux/sched.h b/xenolinux-2.4.24-sparse/include/linux/sched.h new file mode 100644 index 0000000000..ed42340517 --- /dev/null +++ b/xenolinux-2.4.24-sparse/include/linux/sched.h @@ -0,0 +1,966 @@ +#ifndef _LINUX_SCHED_H +#define _LINUX_SCHED_H + +#include /* for HZ */ + +extern unsigned long event; + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +struct exec_domain; + +/* + * cloning flags: + */ +#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ +#define CLONE_VM 0x00000100 /* set if VM shared between processes */ +#define CLONE_FS 0x00000200 /* set if fs info shared between processes */ +#define CLONE_FILES 0x00000400 /* set if open files shared between processes */ +#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared 
*/ +#define CLONE_PID 0x00001000 /* set if pid shared */ +#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ +#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ +#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ +#define CLONE_THREAD 0x00010000 /* Same thread group? */ +#define CLONE_NEWNS 0x00020000 /* New namespace group? */ + +#define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD) + +/* + * These are the constant used to fake the fixed-point load-average + * counting. Some notes: + * - 11 bit fractions expand to 22 bits by the multiplies: this gives + * a load-average precision of 10 bits integer + 11 bits fractional + * - if you want to count load-averages more often, you need more + * precision, or rounding will get you. With 2-second counting freq, + * the EXP_n values would be 1981, 2034 and 2043 if still using only + * 11 bit fractions. + */ +extern unsigned long avenrun[]; /* Load averages */ + +#define FSHIFT 11 /* nr of bits of precision */ +#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ +#define LOAD_FREQ (5*HZ) /* 5 sec intervals */ +#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ +#define EXP_5 2014 /* 1/exp(5sec/5min) */ +#define EXP_15 2037 /* 1/exp(5sec/15min) */ + +#define CALC_LOAD(load,exp,n) \ + load *= exp; \ + load += n*(FIXED_1-exp); \ + load >>= FSHIFT; + +#define CT_TO_SECS(x) ((x) / HZ) +#define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) + +extern int nr_running, nr_threads; +extern int last_pid; + +#include +#include +#include +#include +#ifdef __KERNEL__ +#include +#endif + +#include + +#define TASK_RUNNING 0 +#define TASK_INTERRUPTIBLE 1 +#define TASK_UNINTERRUPTIBLE 2 +#define TASK_ZOMBIE 4 +#define TASK_STOPPED 8 + +#define __set_task_state(tsk, state_value) \ + do { (tsk)->state = (state_value); } while (0) +#define set_task_state(tsk, state_value) \ + set_mb((tsk)->state, (state_value)) + +#define __set_current_state(state_value) \ + do { current->state = (state_value); } while (0) +#define set_current_state(state_value) \ + set_mb(current->state, (state_value)) + +/* + * Scheduling policies + */ +#define SCHED_OTHER 0 +#define SCHED_FIFO 1 +#define SCHED_RR 2 + +/* + * This is an additional bit set 
when we want to + * yield the CPU for one re-schedule.. + */ +#define SCHED_YIELD 0x10 + +struct sched_param { + int sched_priority; +}; + +struct completion; + +#ifdef __KERNEL__ + +#include + +/* + * This serializes "schedule()" and also protects + * the run-queue from deletions/modifications (but + * _adding_ to the beginning of the run-queue has + * a separate lock). + */ +extern rwlock_t tasklist_lock; +extern spinlock_t runqueue_lock; +extern spinlock_t mmlist_lock; + +extern void sched_init(void); +extern void init_idle(void); +extern void show_state(void); +extern void cpu_init (void); +extern void trap_init(void); +extern void update_process_times(int user); +#ifdef CONFIG_NO_IDLE_HZ +extern void update_process_times_us(int user, int system); +#endif +extern void update_one_process(struct task_struct *p, unsigned long user, + unsigned long system, int cpu); + +#define MAX_SCHEDULE_TIMEOUT LONG_MAX +extern signed long FASTCALL(schedule_timeout(signed long timeout)); +asmlinkage void schedule(void); + +extern int schedule_task(struct tq_struct *task); +extern void flush_scheduled_tasks(void); +extern int start_context_thread(void); +extern int current_is_keventd(void); + +#if CONFIG_SMP +extern void set_cpus_allowed(struct task_struct *p, unsigned long new_mask); +#else +# define set_cpus_allowed(p, new_mask) do { } while (0) +#endif + +/* + * The default fd array needs to be at least BITS_PER_LONG, + * as this is the granularity returned by copy_fdset(). + */ +#define NR_OPEN_DEFAULT BITS_PER_LONG + +struct namespace; +/* + * Open file table structure + */ +struct files_struct { + atomic_t count; + rwlock_t file_lock; /* Protects all the below members. 
Nests inside tsk->alloc_lock */ + int max_fds; + int max_fdset; + int next_fd; + struct file ** fd; /* current fd array */ + fd_set *close_on_exec; + fd_set *open_fds; + fd_set close_on_exec_init; + fd_set open_fds_init; + struct file * fd_array[NR_OPEN_DEFAULT]; +}; + +#define INIT_FILES \ +{ \ + count: ATOMIC_INIT(1), \ + file_lock: RW_LOCK_UNLOCKED, \ + max_fds: NR_OPEN_DEFAULT, \ + max_fdset: __FD_SETSIZE, \ + next_fd: 0, \ + fd: &init_files.fd_array[0], \ + close_on_exec: &init_files.close_on_exec_init, \ + open_fds: &init_files.open_fds_init, \ + close_on_exec_init: { { 0, } }, \ + open_fds_init: { { 0, } }, \ + fd_array: { NULL, } \ +} + +/* Maximum number of active map areas.. This is a random (large) number */ +#define DEFAULT_MAX_MAP_COUNT (65536) + +extern int max_map_count; + +struct mm_struct { + struct vm_area_struct * mmap; /* list of VMAs */ + rb_root_t mm_rb; + struct vm_area_struct * mmap_cache; /* last find_vma result */ + pgd_t * pgd; + atomic_t mm_users; /* How many users with user space? */ + atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ + int map_count; /* number of VMAs */ + struct rw_semaphore mmap_sem; + spinlock_t page_table_lock; /* Protects task page tables and mm->rss */ + + struct list_head mmlist; /* List of all active mm's. 
These are globally strung + * together off init_mm.mmlist, and are protected + * by mmlist_lock + */ + + unsigned long start_code, end_code, start_data, end_data; + unsigned long start_brk, brk, start_stack; + unsigned long arg_start, arg_end, env_start, env_end; + unsigned long rss, total_vm, locked_vm; + unsigned long def_flags; + unsigned long cpu_vm_mask; + unsigned long swap_address; + + unsigned dumpable:1; + + /* Architecture-specific MM context */ + mm_context_t context; +}; + +extern int mmlist_nr; + +#define INIT_MM(name) \ +{ \ + mm_rb: RB_ROOT, \ + pgd: swapper_pg_dir, \ + mm_users: ATOMIC_INIT(2), \ + mm_count: ATOMIC_INIT(1), \ + mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem), \ + page_table_lock: SPIN_LOCK_UNLOCKED, \ + mmlist: LIST_HEAD_INIT(name.mmlist), \ +} + +struct signal_struct { + atomic_t count; + struct k_sigaction action[_NSIG]; + spinlock_t siglock; +}; + + +#define INIT_SIGNALS { \ + count: ATOMIC_INIT(1), \ + action: { {{0,}}, }, \ + siglock: SPIN_LOCK_UNLOCKED \ +} + +/* + * Some day this will be a full-fledged user tracking system.. + */ +struct user_struct { + atomic_t __count; /* reference count */ + atomic_t processes; /* How many processes does this user have? */ + atomic_t files; /* How many open files does this user have? 
*/ + + /* Hash table maintenance information */ + struct user_struct *next, **pprev; + uid_t uid; +}; + +#define get_current_user() ({ \ + struct user_struct *__user = current->user; \ + atomic_inc(&__user->__count); \ + __user; }) + +extern struct user_struct root_user; +#define INIT_USER (&root_user) + +struct task_struct { + /* + * offsets of these are hardcoded elsewhere - touch with care + */ + volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ + unsigned long flags; /* per process flags, defined below */ + int sigpending; + mm_segment_t addr_limit; /* thread address space: + 0-0xBFFFFFFF for user-thead + 0-0xFFFFFFFF for kernel-thread + */ + struct exec_domain *exec_domain; + volatile long need_resched; + unsigned long ptrace; + + int lock_depth; /* Lock depth */ + +/* + * offset 32 begins here on 32-bit platforms. We keep + * all fields in a single cacheline that are needed for + * the goodness() loop in schedule(). + */ + long counter; + long nice; + unsigned long policy; + struct mm_struct *mm; + int processor; + /* + * cpus_runnable is ~0 if the process is not running on any + * CPU. It's (1 << cpu) if it's running on a CPU. This mask + * is updated under the runqueue lock. + * + * To determine whether a process might run on a CPU, this + * mask is AND-ed with cpus_allowed. + */ + unsigned long cpus_runnable, cpus_allowed; + /* + * (only the 'next' pointer fits into the cacheline, but + * that's just fine.) + */ + struct list_head run_list; + unsigned long sleep_time; + + struct task_struct *next_task, *prev_task; + struct mm_struct *active_mm; + struct list_head local_pages; + unsigned int allocation_order, nr_local_pages; + +/* task state */ + struct linux_binfmt *binfmt; + int exit_code, exit_signal; + int pdeath_signal; /* The signal sent when the parent dies */ + /* ??? 
*/ + unsigned long personality; + int did_exec:1; + unsigned task_dumpable:1; + pid_t pid; + pid_t pgrp; + pid_t tty_old_pgrp; + pid_t session; + pid_t tgid; + /* boolean value for session group leader */ + int leader; + /* + * pointers to (original) parent process, youngest child, younger sibling, + * older sibling, respectively. (p->father can be replaced with + * p->p_pptr->pid) + */ + struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; + struct list_head thread_group; + + /* PID hash table linkage. */ + struct task_struct *pidhash_next; + struct task_struct **pidhash_pprev; + + wait_queue_head_t wait_chldexit; /* for wait4() */ + struct completion *vfork_done; /* for vfork() */ + unsigned long rt_priority; + unsigned long it_real_value, it_prof_value, it_virt_value; + unsigned long it_real_incr, it_prof_incr, it_virt_incr; + struct timer_list real_timer; + struct tms times; + unsigned long start_time; + long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS]; +/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ + unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; + int swappable:1; +/* process credentials */ + uid_t uid,euid,suid,fsuid; + gid_t gid,egid,sgid,fsgid; + int ngroups; + gid_t groups[NGROUPS]; + kernel_cap_t cap_effective, cap_inheritable, cap_permitted; + int keep_capabilities:1; + struct user_struct *user; +/* limits */ + struct rlimit rlim[RLIM_NLIMITS]; + unsigned short used_math; + char comm[16]; +/* file system info */ + int link_count, total_link_count; + struct tty_struct *tty; /* NULL if no tty */ + unsigned int locks; /* How many file locks are being held */ +/* ipc stuff */ + struct sem_undo *semundo; + struct sem_queue *semsleeping; +/* CPU-specific state of this task */ + struct thread_struct thread; +/* filesystem information */ + struct fs_struct *fs; +/* open file information */ + struct files_struct *files; +/* namespace */ + struct namespace *namespace; +/* 
signal handlers */ + spinlock_t sigmask_lock; /* Protects signal and blocked */ + struct signal_struct *sig; + + sigset_t blocked; + struct sigpending pending; + + unsigned long sas_ss_sp; + size_t sas_ss_size; + int (*notifier)(void *priv); + void *notifier_data; + sigset_t *notifier_mask; + +/* Thread group tracking */ + u32 parent_exec_id; + u32 self_exec_id; +/* Protection of (de-)allocation: mm, files, fs, tty */ + spinlock_t alloc_lock; + +/* journalling filesystem info */ + void *journal_info; +}; + +/* + * Per process flags + */ +#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ + /* Not implemented yet, only for 486*/ +#define PF_STARTING 0x00000002 /* being created */ +#define PF_EXITING 0x00000004 /* getting shut down */ +#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ +#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ +#define PF_DUMPCORE 0x00000200 /* dumped core */ +#define PF_SIGNALED 0x00000400 /* killed by a signal */ +#define PF_MEMALLOC 0x00000800 /* Allocating memory */ +#define PF_FREE_PAGES 0x00002000 /* per process page freeing */ +#define PF_NOIO 0x00004000 /* avoid generating further I/O */ + +#define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */ + +/* + * Ptrace flags + */ + +#define PT_PTRACED 0x00000001 +#define PT_TRACESYS 0x00000002 +#define PT_DTRACE 0x00000004 /* delayed trace (used on m68k, i386) */ +#define PT_TRACESYSGOOD 0x00000008 +#define PT_PTRACE_CAP 0x00000010 /* ptracer can follow suid-exec */ + +#define is_dumpable(tsk) ((tsk)->task_dumpable && (tsk)->mm && (tsk)->mm->dumpable) + +/* + * Limit the stack by to some sane default: root can always + * increase this limit if needed.. 8MB seems reasonable. + */ +#define _STK_LIM (8*1024*1024) + +#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */ +#define MAX_COUNTER (20*HZ/100) +#define DEF_NICE (0) + +extern void yield(void); + +/* + * The default (Linux) execution domain. 
+ */ +extern struct exec_domain default_exec_domain; + +/* + * INIT_TASK is used to set up the first task table, touch at + * your own risk!. Base=0, limit=0x1fffff (=2MB) + */ +#define INIT_TASK(tsk) \ +{ \ + state: 0, \ + flags: 0, \ + sigpending: 0, \ + addr_limit: KERNEL_DS, \ + exec_domain: &default_exec_domain, \ + lock_depth: -1, \ + counter: DEF_COUNTER, \ + nice: DEF_NICE, \ + policy: SCHED_OTHER, \ + mm: NULL, \ + active_mm: &init_mm, \ + cpus_runnable: ~0UL, \ + cpus_allowed: ~0UL, \ + run_list: LIST_HEAD_INIT(tsk.run_list), \ + next_task: &tsk, \ + prev_task: &tsk, \ + p_opptr: &tsk, \ + p_pptr: &tsk, \ + thread_group: LIST_HEAD_INIT(tsk.thread_group), \ + wait_chldexit: __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\ + real_timer: { \ + function: it_real_fn \ + }, \ + cap_effective: CAP_INIT_EFF_SET, \ + cap_inheritable: CAP_INIT_INH_SET, \ + cap_permitted: CAP_FULL_SET, \ + keep_capabilities: 0, \ + rlim: INIT_RLIMITS, \ + user: INIT_USER, \ + comm: "swapper", \ + thread: INIT_THREAD, \ + fs: &init_fs, \ + files: &init_files, \ + sigmask_lock: SPIN_LOCK_UNLOCKED, \ + sig: &init_signals, \ + pending: { NULL, &tsk.pending.head, {{0}}}, \ + blocked: {{0}}, \ + alloc_lock: SPIN_LOCK_UNLOCKED, \ + journal_info: NULL, \ +} + + +#ifndef INIT_TASK_SIZE +# define INIT_TASK_SIZE 2048*sizeof(long) +#endif + +union task_union { + struct task_struct task; + unsigned long stack[INIT_TASK_SIZE/sizeof(long)]; +}; + +extern union task_union init_task_union; + +extern struct mm_struct init_mm; +extern struct task_struct *init_tasks[NR_CPUS]; + +/* PID hashing. (shouldnt this be dynamic?) 
*/ +#define PIDHASH_SZ (4096 >> 2) +extern struct task_struct *pidhash[PIDHASH_SZ]; + +#define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) + +static inline void hash_pid(struct task_struct *p) +{ + struct task_struct **htable = &pidhash[pid_hashfn(p->pid)]; + + if((p->pidhash_next = *htable) != NULL) + (*htable)->pidhash_pprev = &p->pidhash_next; + *htable = p; + p->pidhash_pprev = htable; +} + +static inline void unhash_pid(struct task_struct *p) +{ + if(p->pidhash_next) + p->pidhash_next->pidhash_pprev = p->pidhash_pprev; + *p->pidhash_pprev = p->pidhash_next; +} + +static inline struct task_struct *find_task_by_pid(int pid) +{ + struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)]; + + for(p = *htable; p && p->pid != pid; p = p->pidhash_next) + ; + + return p; +} + +#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL) + +static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu) +{ + tsk->processor = cpu; + tsk->cpus_runnable = 1UL << cpu; +} + +static inline void task_release_cpu(struct task_struct *tsk) +{ + tsk->cpus_runnable = ~0UL; +} + +/* per-UID process charging. 
*/ +extern struct user_struct * alloc_uid(uid_t); +extern void free_uid(struct user_struct *); +extern void switch_uid(struct user_struct *); + +#include + +extern unsigned long volatile jiffies; +extern unsigned long itimer_ticks; +extern unsigned long itimer_next; +extern struct timeval xtime; +extern void do_timer(struct pt_regs *); +#ifdef CONFIG_NO_IDLE_HZ +extern void do_timer_ticks(int ticks); +#endif + +extern unsigned int * prof_buffer; +extern unsigned long prof_len; +extern unsigned long prof_shift; + +#define CURRENT_TIME (xtime.tv_sec) + +extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr)); +extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)); +extern void FASTCALL(sleep_on(wait_queue_head_t *q)); +extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q, + signed long timeout)); +extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); +extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, + signed long timeout)); +extern int FASTCALL(wake_up_process(struct task_struct * tsk)); + +#define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) +#define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) +#define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0) +#define wake_up_sync(x) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) +#define wake_up_sync_nr(x, nr) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) +#define wake_up_interruptible(x) __wake_up((x),TASK_INTERRUPTIBLE, 1) +#define wake_up_interruptible_nr(x, nr) __wake_up((x),TASK_INTERRUPTIBLE, nr) +#define wake_up_interruptible_all(x) __wake_up((x),TASK_INTERRUPTIBLE, 0) +#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) +#define wake_up_interruptible_sync_nr(x, nr) __wake_up_sync((x),TASK_INTERRUPTIBLE, nr) +asmlinkage long sys_wait4(pid_t pid,unsigned int * 
stat_addr, int options, struct rusage * ru); + +extern int in_group_p(gid_t); +extern int in_egroup_p(gid_t); + +extern void proc_caches_init(void); +extern void flush_signals(struct task_struct *); +extern void flush_signal_handlers(struct task_struct *); +extern void sig_exit(int, int, struct siginfo *); +extern int dequeue_signal(sigset_t *, siginfo_t *); +extern void block_all_signals(int (*notifier)(void *priv), void *priv, + sigset_t *mask); +extern void unblock_all_signals(void); +extern int send_sig_info(int, struct siginfo *, struct task_struct *); +extern int force_sig_info(int, struct siginfo *, struct task_struct *); +extern int kill_pg_info(int, struct siginfo *, pid_t); +extern int kill_sl_info(int, struct siginfo *, pid_t); +extern int kill_proc_info(int, struct siginfo *, pid_t); +extern void notify_parent(struct task_struct *, int); +extern void do_notify_parent(struct task_struct *, int); +extern void force_sig(int, struct task_struct *); +extern int send_sig(int, struct task_struct *, int); +extern int kill_pg(pid_t, int, int); +extern int kill_sl(pid_t, int, int); +extern int kill_proc(pid_t, int, int); +extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); +extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long); + +static inline int signal_pending(struct task_struct *p) +{ + return (p->sigpending != 0); +} + +/* + * Re-calculate pending state from the set of locally pending + * signals, globally pending signals, and blocked signals. 
+ */ +static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) +{ + unsigned long ready; + long i; + + switch (_NSIG_WORDS) { + default: + for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) + ready |= signal->sig[i] &~ blocked->sig[i]; + break; + + case 4: ready = signal->sig[3] &~ blocked->sig[3]; + ready |= signal->sig[2] &~ blocked->sig[2]; + ready |= signal->sig[1] &~ blocked->sig[1]; + ready |= signal->sig[0] &~ blocked->sig[0]; + break; + + case 2: ready = signal->sig[1] &~ blocked->sig[1]; + ready |= signal->sig[0] &~ blocked->sig[0]; + break; + + case 1: ready = signal->sig[0] &~ blocked->sig[0]; + } + return ready != 0; +} + +/* Reevaluate whether the task has signals pending delivery. + This is required every time the blocked sigset_t changes. + All callers should have t->sigmask_lock. */ + +static inline void recalc_sigpending(struct task_struct *t) +{ + t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked); +} + +/* True if we are on the alternate signal stack. */ + +static inline int on_sig_stack(unsigned long sp) +{ + return (sp - current->sas_ss_sp < current->sas_ss_size); +} + +static inline int sas_ss_flags(unsigned long sp) +{ + return (current->sas_ss_size == 0 ? SS_DISABLE + : on_sig_stack(sp) ? SS_ONSTACK : 0); +} + +extern int request_irq(unsigned int, + void (*handler)(int, void *, struct pt_regs *), + unsigned long, const char *, void *); +extern void free_irq(unsigned int, void *); + +/* + * This has now become a routine instead of a macro, it sets a flag if + * it returns true (to do BSD-style accounting where the process is flagged + * if it uses root privs). The implication of this is that you should do + * normal permissions checks first, and check suser() last. + * + * [Dec 1997 -- Chris Evans] + * For correctness, the above considerations need to be extended to + * fsuser(). This is done, along with moving fsuser() checks to be + * last. 
+ * + * These will be removed, but in the mean time, when the SECURE_NOROOT + * flag is set, uids don't grant privilege. + */ +static inline int suser(void) +{ + if (!issecure(SECURE_NOROOT) && current->euid == 0) { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; +} + +static inline int fsuser(void) +{ + if (!issecure(SECURE_NOROOT) && current->fsuid == 0) { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; +} + +/* + * capable() checks for a particular capability. + * New privilege checks should use this interface, rather than suser() or + * fsuser(). See include/linux/capability.h for defined capabilities. + */ + +static inline int capable(int cap) +{ +#if 1 /* ok now */ + if (cap_raised(current->cap_effective, cap)) +#else + if (cap_is_fs_cap(cap) ? current->fsuid == 0 : current->euid == 0) +#endif + { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; +} + +/* + * Routines for handling mm_structs + */ +extern struct mm_struct * mm_alloc(void); + +extern struct mm_struct * start_lazy_tlb(void); +extern void end_lazy_tlb(struct mm_struct *mm); + +/* mmdrop drops the mm and the page tables */ +extern inline void FASTCALL(__mmdrop(struct mm_struct *)); +static inline void mmdrop(struct mm_struct * mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + __mmdrop(mm); +} + +/* mmput gets rid of the mappings and all user-space */ +extern void mmput(struct mm_struct *); +/* Remove the current tasks stale references to the old mm_struct */ +extern void mm_release(void); + +/* + * Routines for handling the fd arrays + */ +extern struct file ** alloc_fd_array(int); +extern int expand_fd_array(struct files_struct *, int nr); +extern void free_fd_array(struct file **, int); + +extern fd_set *alloc_fdset(int); +extern int expand_fdset(struct files_struct *, int nr); +extern void free_fdset(fd_set *, int); + +extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); +extern void 
flush_thread(void); +extern void exit_thread(void); + +extern void exit_mm(struct task_struct *); +extern void exit_files(struct task_struct *); +extern void exit_sighand(struct task_struct *); + +extern void reparent_to_init(void); +extern void daemonize(void); + +extern int do_execve(char *, char **, char **, struct pt_regs *); +extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long); + +extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); +extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); +extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); + +extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); + +#define __wait_event(wq, condition) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + schedule(); \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_event(wq, condition) \ +do { \ + if (condition) \ + break; \ + __wait_event(wq, condition); \ +} while (0) + +#define __wait_event_interruptible(wq, condition, ret) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (condition) \ + break; \ + if (!signal_pending(current)) { \ + schedule(); \ + continue; \ + } \ + ret = -ERESTARTSYS; \ + break; \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_event_interruptible(wq, condition) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + __wait_event_interruptible(wq, condition, __ret); \ + __ret; \ +}) + +#define REMOVE_LINKS(p) do { \ + (p)->next_task->prev_task = (p)->prev_task; \ + (p)->prev_task->next_task = (p)->next_task; \ + if 
((p)->p_osptr) \ + (p)->p_osptr->p_ysptr = (p)->p_ysptr; \ + if ((p)->p_ysptr) \ + (p)->p_ysptr->p_osptr = (p)->p_osptr; \ + else \ + (p)->p_pptr->p_cptr = (p)->p_osptr; \ + } while (0) + +#define SET_LINKS(p) do { \ + (p)->next_task = &init_task; \ + (p)->prev_task = init_task.prev_task; \ + init_task.prev_task->next_task = (p); \ + init_task.prev_task = (p); \ + (p)->p_ysptr = NULL; \ + if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \ + (p)->p_osptr->p_ysptr = p; \ + (p)->p_pptr->p_cptr = p; \ + } while (0) + +#define for_each_task(p) \ + for (p = &init_task ; (p = p->next_task) != &init_task ; ) + +#define for_each_thread(task) \ + for (task = next_thread(current) ; task != current ; task = next_thread(task)) + +#define next_thread(p) \ + list_entry((p)->thread_group.next, struct task_struct, thread_group) + +#define thread_group_leader(p) (p->pid == p->tgid) + +static inline void del_from_runqueue(struct task_struct * p) +{ + nr_running--; + p->sleep_time = jiffies; + list_del(&p->run_list); + p->run_list.next = NULL; +} + +static inline int task_on_runqueue(struct task_struct *p) +{ + return (p->run_list.next != NULL); +} + +static inline void unhash_process(struct task_struct *p) +{ + if (task_on_runqueue(p)) + out_of_line_bug(); + write_lock_irq(&tasklist_lock); + nr_threads--; + unhash_pid(p); + REMOVE_LINKS(p); + list_del(&p->thread_group); + write_unlock_irq(&tasklist_lock); +} + +/* Protects ->fs, ->files, ->mm, and synchronises with wait4(). 
Nests inside tasklist_lock */ +static inline void task_lock(struct task_struct *p) +{ + spin_lock(&p->alloc_lock); +} + +static inline void task_unlock(struct task_struct *p) +{ + spin_unlock(&p->alloc_lock); +} + +/* write full pathname into buffer and return start of pathname */ +static inline char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt, + char *buf, int buflen) +{ + char *res; + struct vfsmount *rootmnt; + struct dentry *root; + read_lock(&current->fs->lock); /* fs->lock is held only while current's root and rootmnt are read */ + rootmnt = mntget(current->fs->rootmnt); + root = dget(current->fs->root); + read_unlock(&current->fs->lock); + spin_lock(&dcache_lock); /* __d_path() is called with dcache_lock held */ + res = __d_path(dentry, vfsmnt, root, rootmnt, buf, buflen); + spin_unlock(&dcache_lock); + dput(root); + mntput(rootmnt); + return res; +} + +static inline int need_resched(void) +{ + return (unlikely(current->need_resched)); +} + +extern void __cond_resched(void); +static inline void cond_resched(void) +{ + if (need_resched()) + __cond_resched(); +} + +#endif /* __KERNEL__ */ +#endif diff --git a/xenolinux-2.4.24-sparse/include/linux/timer.h b/xenolinux-2.4.24-sparse/include/linux/timer.h new file mode 100644 index 0000000000..238083218f --- /dev/null +++ b/xenolinux-2.4.24-sparse/include/linux/timer.h @@ -0,0 +1,77 @@ +#ifndef _LINUX_TIMER_H +#define _LINUX_TIMER_H + +#include <linux/config.h> +#include <linux/list.h> + +/* + * In Linux 2.4, static timers have been removed from the kernel. + * Timers may be dynamically created and destroyed, and should be initialized + * by a call to init_timer() upon creation. + * + * The "data" field enables use of a common timeout function for several + * timeouts. You can use this field to distinguish between the different + * invocations. 
+ */ +struct timer_list { + struct list_head list; + unsigned long expires; + unsigned long data; + void (*function)(unsigned long); +}; + +extern void add_timer(struct timer_list * timer); +extern int del_timer(struct timer_list * timer); +#ifdef CONFIG_NO_IDLE_HZ +extern struct timer_list *next_timer_event(void); +#endif + +#ifdef CONFIG_SMP +extern int del_timer_sync(struct timer_list * timer); +extern void sync_timers(void); +#else +#define del_timer_sync(t) del_timer(t) +#define sync_timers() do { } while (0) +#endif + +/* + * mod_timer is a more efficient way to update the expires field of an + * active timer (if the timer is inactive it will be activated). + * mod_timer(a,b) is equivalent to del_timer(a); a->expires = b; add_timer(a). + * If the timer is known to be not pending (i.e., in the handler), mod_timer + * is less efficient than a->expires = b; add_timer(a). + */ +int mod_timer(struct timer_list *timer, unsigned long expires); + +extern void it_real_fn(unsigned long); + +static inline void init_timer(struct timer_list * timer) +{ + timer->list.next = timer->list.prev = NULL; +} + +static inline int timer_pending (const struct timer_list * timer) +{ + return timer->list.next != NULL; +} + +/* + * These inlines deal with timer wrapping correctly. You are + * strongly encouraged to use them: + * 1. Because people otherwise forget. + * 2. Because if the timer wrap changes in future you won't have to + * alter your driver code. + * + * time_after(a,b) returns true if the time a is after time b. + * + * Do this with "<0" and ">=0" to only test the sign of the result. A + * good compiler would generate better code (and a really good compiler + * wouldn't care). Gcc is currently neither. 
+ */ +#define time_after(a,b) ((long)(b) - (long)(a) < 0) +#define time_before(a,b) time_after(b,a) + +#define time_after_eq(a,b) ((long)(a) - (long)(b) >= 0) +#define time_before_eq(a,b) time_after_eq(b,a) + +#endif diff --git a/xenolinux-2.4.24-sparse/kernel/panic.c b/xenolinux-2.4.24-sparse/kernel/panic.c index 871ea67fee..6ab619a607 100644 --- a/xenolinux-2.4.24-sparse/kernel/panic.c +++ b/xenolinux-2.4.24-sparse/kernel/panic.c @@ -110,7 +110,8 @@ NORET_TYPE void panic(const char * fmt, ...) #endif CHECK_EMERGENCY_SYNC #if defined(CONFIG_XENO) - HYPERVISOR_exit(); + HYPERVISOR_console_write(buf, strlen(buf)); + HYPERVISOR_exit(); #endif } } diff --git a/xenolinux-2.4.24-sparse/kernel/timer.c b/xenolinux-2.4.24-sparse/kernel/timer.c new file mode 100644 index 0000000000..567794ab26 --- /dev/null +++ b/xenolinux-2.4.24-sparse/kernel/timer.c @@ -0,0 +1,968 @@ +/* + * linux/kernel/timer.c + * + * Kernel internal timers, kernel timekeeping, basic process system calls + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. + * + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to + * serialize accesses to xtime/lost_ticks). + * Copyright (C) 1998 Andrea Arcangeli + * 1999-03-10 Improved NTP compatibility by Ulrich Windl + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/timex.h> +#include <linux/delay.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> + +#include <asm/uaccess.h> + +/* + * Timekeeping variables + */ + +long tick = (1000000 + HZ/2) / HZ; /* timer interrupt period */ + +/* The current time */ +struct timeval xtime __attribute__ ((aligned (16))); + +/* Don't completely fail for HZ > 500. */ +int tickadj = 500/HZ ? 
: 1; /* microsecs */ + +DECLARE_TASK_QUEUE(tq_timer); +DECLARE_TASK_QUEUE(tq_immediate); + +/* + * phase-lock loop variables + */ +/* TIME_ERROR prevents overwriting the CMOS clock */ +int time_state = TIME_OK; /* clock synchronization status */ +int time_status = STA_UNSYNC; /* clock status bits */ +long time_offset; /* time adjustment (us) */ +long time_constant = 2; /* pll time constant */ +long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ +long time_precision = 1; /* clock precision (us) */ +long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ +long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ +long time_phase; /* phase offset (scaled us) */ +long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC; + /* frequency offset (scaled ppm)*/ +long time_adj; /* tick adjust (scaled 1 / HZ) */ +long time_reftime; /* time at last adjustment (s) */ + +long time_adjust; +long time_adjust_step; + +unsigned long event; + +extern int do_setitimer(int, struct itimerval *, struct itimerval *); + +unsigned long volatile jiffies; + +unsigned int * prof_buffer; +unsigned long prof_len; +unsigned long prof_shift; + +/* + * Event timer code + */ +#define TVN_BITS 6 +#define TVR_BITS 8 +#define TVN_SIZE (1 << TVN_BITS) +#define TVR_SIZE (1 << TVR_BITS) +#define TVN_MASK (TVN_SIZE - 1) +#define TVR_MASK (TVR_SIZE - 1) + +struct timer_vec { + int index; + struct list_head vec[TVN_SIZE]; +}; + +struct timer_vec_root { + int index; + struct list_head vec[TVR_SIZE]; +}; + +static struct timer_vec tv5; +static struct timer_vec tv4; +static struct timer_vec tv3; +static struct timer_vec tv2; +static struct timer_vec_root tv1; + +static struct timer_vec * const tvecs[] = { + (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5 +}; + +static struct list_head * run_timer_list_running; + +#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0])) + +void init_timervecs (void) +{ + int i; + + for (i = 0; i < TVN_SIZE; i++) { + INIT_LIST_HEAD(tv5.vec + i); + 
INIT_LIST_HEAD(tv4.vec + i); + INIT_LIST_HEAD(tv3.vec + i); + INIT_LIST_HEAD(tv2.vec + i); + } + for (i = 0; i < TVR_SIZE; i++) + INIT_LIST_HEAD(tv1.vec + i); +} + +static unsigned long timer_jiffies; + +static inline void internal_add_timer(struct timer_list *timer) +{ + /* + * must be cli-ed when calling this + */ + unsigned long expires = timer->expires; + unsigned long idx = expires - timer_jiffies; + struct list_head * vec; + + if (run_timer_list_running) + vec = run_timer_list_running; + else if (idx < TVR_SIZE) { + int i = expires & TVR_MASK; + vec = tv1.vec + i; + } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { + int i = (expires >> TVR_BITS) & TVN_MASK; + vec = tv2.vec + i; + } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; + vec = tv3.vec + i; + } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; + vec = tv4.vec + i; + } else if ((signed long) idx < 0) { + /* can happen if you add a timer with expires == jiffies, + * or you set a timer to go off in the past + */ + vec = tv1.vec + tv1.index; + } else if (idx <= 0xffffffffUL) { + int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; + vec = tv5.vec + i; + } else { + /* Can only get here on architectures with 64-bit jiffies */ + INIT_LIST_HEAD(&timer->list); + return; + } + /* + * Timers are FIFO! 
+ */ + list_add(&timer->list, vec->prev); +} + +/* Initialize both explicitly - let's try to have them in the same cache line */ +spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED; + +#ifdef CONFIG_SMP +volatile struct timer_list * volatile running_timer; +#define timer_enter(t) do { running_timer = t; mb(); } while (0) +#define timer_exit() do { running_timer = NULL; } while (0) +#define timer_is_running(t) (running_timer == t) +#define timer_synchronize(t) while (timer_is_running(t)) barrier() +#else +#define timer_enter(t) do { } while (0) +#define timer_exit() do { } while (0) +#endif + +void add_timer(struct timer_list *timer) +{ + unsigned long flags; + + spin_lock_irqsave(&timerlist_lock, flags); + if (timer_pending(timer)) + goto bug; + internal_add_timer(timer); + spin_unlock_irqrestore(&timerlist_lock, flags); + return; +bug: + spin_unlock_irqrestore(&timerlist_lock, flags); + printk("bug: kernel timer added twice at %p.\n", + __builtin_return_address(0)); +} + +static inline int detach_timer (struct timer_list *timer) +{ + if (!timer_pending(timer)) + return 0; + list_del(&timer->list); + return 1; +} + +int mod_timer(struct timer_list *timer, unsigned long expires) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&timerlist_lock, flags); + timer->expires = expires; + ret = detach_timer(timer); + internal_add_timer(timer); + spin_unlock_irqrestore(&timerlist_lock, flags); + return ret; +} + +int del_timer(struct timer_list * timer) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&timerlist_lock, flags); + ret = detach_timer(timer); + timer->list.next = timer->list.prev = NULL; + spin_unlock_irqrestore(&timerlist_lock, flags); + return ret; +} + +#ifdef CONFIG_SMP +void sync_timers(void) +{ + spin_unlock_wait(&global_bh_lock); +} + +/* + * SMP specific function to delete periodic timer. + * Caller must disable by some means restarting the timer + * for new. Upon exit the timer is not queued and handler is not running + * on any CPU. 
It returns number of times, which timer was deleted + * (for reference counting). + */ + +int del_timer_sync(struct timer_list * timer) +{ + int ret = 0; + + for (;;) { + unsigned long flags; + int running; + + spin_lock_irqsave(&timerlist_lock, flags); + ret += detach_timer(timer); + timer->list.next = timer->list.prev = 0; + running = timer_is_running(timer); + spin_unlock_irqrestore(&timerlist_lock, flags); + + if (!running) + break; + + timer_synchronize(timer); + } + + return ret; +} +#endif + + +static inline void cascade_timers(struct timer_vec *tv) +{ + /* cascade all the timers from tv up one level */ + struct list_head *head, *curr, *next; + + head = tv->vec + tv->index; + curr = head->next; + /* + * We are removing _all_ timers from the list, so we don't have to + * detach them individually, just clear the list afterwards. + */ + while (curr != head) { + struct timer_list *tmp; + + tmp = list_entry(curr, struct timer_list, list); + next = curr->next; + list_del(curr); // not needed + internal_add_timer(tmp); + curr = next; + } + INIT_LIST_HEAD(head); + tv->index = (tv->index + 1) & TVN_MASK; +} + +static inline void run_timer_list(void) +{ + spin_lock_irq(&timerlist_lock); + while ((long)(jiffies - timer_jiffies) >= 0) { + LIST_HEAD(queued); + struct list_head *head, *curr; + if (!tv1.index) { + int n = 1; + do { + cascade_timers(tvecs[n]); + } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS); + } + run_timer_list_running = &queued; +repeat: + head = tv1.vec + tv1.index; + curr = head->next; + if (curr != head) { + struct timer_list *timer; + void (*fn)(unsigned long); + unsigned long data; + + timer = list_entry(curr, struct timer_list, list); + fn = timer->function; + data= timer->data; + + detach_timer(timer); + timer->list.next = timer->list.prev = NULL; + timer_enter(timer); + spin_unlock_irq(&timerlist_lock); + fn(data); + spin_lock_irq(&timerlist_lock); + timer_exit(); + goto repeat; + } + run_timer_list_running = NULL; + ++timer_jiffies; + 
tv1.index = (tv1.index + 1) & TVR_MASK; + + curr = queued.next; + while (curr != &queued) { + struct timer_list *timer; + + timer = list_entry(curr, struct timer_list, list); + curr = curr->next; + internal_add_timer(timer); + } + } + spin_unlock_irq(&timerlist_lock); +} + +#ifdef CONFIG_NO_IDLE_HZ +/* + * Find out when the next timer event is due to happen. This + * is used on S/390 to stop all activity when all cpus are idle. + * And in XenoLinux to achieve the same. + * The timerlist_lock must be acquired before calling this function. + */ +struct timer_list *next_timer_event(void) +{ + struct timer_list *nte, *tmp; + struct list_head *lst; + int i, j; + + /* Look for the next timer event in tv1. */ + i = 0; + j = tvecs[0]->index; + do { + struct list_head *head = tvecs[0]->vec + j; + if (!list_empty(head)) { + nte = list_entry(head->next, struct timer_list, list); + goto found; + } + j = (j + 1) & TVR_MASK; + } while (j != tv1.index); + + /* No event found in tv1. Check tv2-tv5. */ + for (i = 1; i < NOOF_TVECS; i++) { + j = tvecs[i]->index; + do { + nte = NULL; + list_for_each(lst, tvecs[i]->vec + j) { + tmp = list_entry(lst, struct timer_list, list); + if (nte == NULL || + time_before(tmp->expires, nte->expires)) + nte = tmp; + } + if (nte) + goto found; + j = (j + 1) & TVN_MASK; + } while (j != tvecs[i]->index); + } + return NULL; +found: + /* Found timer event in tvecs[i]->vec[j] */ + if (j < tvecs[i]->index && i < NOOF_TVECS-1) { + /* + * The search wrapped. We need to look at the next list + * from tvecs[i+1] that would cascade into tvecs[i]. 
+ */ + list_for_each(lst, tvecs[i+1]->vec+tvecs[i+1]->index) { + tmp = list_entry(lst, struct timer_list, list); + if (time_before(tmp->expires, nte->expires)) + nte = tmp; + } + } + return nte; +} +#endif + +spinlock_t tqueue_lock = SPIN_LOCK_UNLOCKED; + +void tqueue_bh(void) +{ + run_task_queue(&tq_timer); +} + +void immediate_bh(void) +{ + run_task_queue(&tq_immediate); +} + +/* + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + * + */ +static void second_overflow(void) +{ + long ltemp; + + /* Bump the maxerror field */ + time_maxerror += time_tolerance >> SHIFT_USEC; + if ( time_maxerror > NTP_PHASE_LIMIT ) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; + } + + /* + * Leap second processing. If in leap-insert state at + * the end of the day, the system clock is set back one + * second; if in leap-delete state, the system clock is + * set ahead one second. The microtime() routine or + * external clock driver will insure that reported time + * is always monotonic. The ugly divides should be + * replaced. 
+ */ + switch (time_state) { + + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + + case TIME_INS: + if (xtime.tv_sec % 86400 == 0) { + xtime.tv_sec--; + time_state = TIME_OOP; + printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); + } + break; + + case TIME_DEL: + if ((xtime.tv_sec + 1) % 86400 == 0) { + xtime.tv_sec++; + time_state = TIME_WAIT; + printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); + } + break; + + case TIME_OOP: + time_state = TIME_WAIT; + break; + + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + } + + /* + * Compute the phase adjustment for the next second. In + * PLL mode, the offset is reduced by a fixed factor + * times the time constant. In FLL mode the offset is + * used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread + * the adjustment over not more than the number of + * seconds between updates. + */ + if (time_offset < 0) { + ltemp = -time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + time_offset += ltemp; + time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); + } else { + ltemp = time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + time_offset -= ltemp; + time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); + } + + /* + * Compute the frequency estimate and additional phase + * adjustment due to frequency error for the next + * second. When the PPS signal is engaged, gnaw on the + * watchdog counter and update the frequency computed by + * the pll and the PPS signal. 
+ */ + pps_valid++; + if (pps_valid == PPS_VALID) { /* PPS signal lost */ + pps_jitter = MAXTIME; + pps_stabil = MAXFREQ; + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + } + ltemp = time_freq + pps_freq; + if (ltemp < 0) + time_adj -= -ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + else + time_adj += ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + +#if HZ == 100 + /* Compensate for (HZ==100) != (1 << SHIFT_HZ). + * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) + */ + if (time_adj < 0) + time_adj -= (-time_adj >> 2) + (-time_adj >> 5); + else + time_adj += (time_adj >> 2) + (time_adj >> 5); +#endif +} + +/* in the NTP reference this is called "hardclock()" */ +static void update_wall_time_one_tick(void) +{ + if ( (time_adjust_step = time_adjust) != 0 ) { + /* We are doing an adjtime thing. + * + * Prepare time_adjust_step to be within bounds. + * Note that a positive time_adjust means we want the clock + * to run faster. + * + * Limit the amount of the step to be in the range + * -tickadj .. +tickadj + */ + if (time_adjust > tickadj) + time_adjust_step = tickadj; + else if (time_adjust < -tickadj) + time_adjust_step = -tickadj; + + /* Reduce by this step the amount of time left */ + time_adjust -= time_adjust_step; + } + xtime.tv_usec += tick + time_adjust_step; + /* + * Advance the phase, once it gets to one microsecond, then + * advance the tick more. 
+ */ + time_phase += time_adj; + if (time_phase <= -FINEUSEC) { + long ltemp = -time_phase >> SHIFT_SCALE; + time_phase += ltemp << SHIFT_SCALE; + xtime.tv_usec -= ltemp; + } + else if (time_phase >= FINEUSEC) { + long ltemp = time_phase >> SHIFT_SCALE; + time_phase -= ltemp << SHIFT_SCALE; + xtime.tv_usec += ltemp; + } +} + +/* + * Using a loop looks inefficient, but "ticks" is + * usually just one (we shouldn't be losing ticks, + * we're doing this this way mainly for interrupt + * latency reasons, not because we think we'll + * have lots of lost timer ticks + */ +static void update_wall_time(unsigned long ticks) +{ + do { + ticks--; + update_wall_time_one_tick(); + } while (ticks); + + while (xtime.tv_usec >= 1000000) { + xtime.tv_usec -= 1000000; + xtime.tv_sec++; + second_overflow(); + } +} + +static inline void do_process_times(struct task_struct *p, + unsigned long user, unsigned long system) +{ + unsigned long psecs; + + psecs = (p->times.tms_utime += user); + psecs += (p->times.tms_stime += system); + if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) { + /* Send SIGXCPU every second.. */ + if (!(psecs % HZ)) + send_sig(SIGXCPU, p, 1); + /* and SIGKILL when we go over max.. 
*/ + if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max) + send_sig(SIGKILL, p, 1); + } +} + +static inline void do_it_virt(struct task_struct * p, unsigned long ticks) +{ + unsigned long it_virt = p->it_virt_value; + + if (it_virt) { + it_virt = (it_virt > ticks) ? it_virt - ticks : 0; /* clamp at zero: a multi-tick charge larger than the remaining count must expire the timer, not wrap the unsigned counter */ + if (!it_virt) { + it_virt = p->it_virt_incr; + send_sig(SIGVTALRM, p, 1); + } + p->it_virt_value = it_virt; + } +} + +static inline void do_it_prof(struct task_struct *p) +{ + unsigned long it_prof = p->it_prof_value; + + if (it_prof) { + if (--it_prof == 0) { + it_prof = p->it_prof_incr; + send_sig(SIGPROF, p, 1); + } + p->it_prof_value = it_prof; + } +} + +void update_one_process(struct task_struct *p, unsigned long user, + unsigned long system, int cpu) +{ + p->per_cpu_utime[cpu] += user; + p->per_cpu_stime[cpu] += system; + do_process_times(p, user, system); + do_it_virt(p, user); /* user may exceed 1 when called via update_process_times_us() */ + do_it_prof(p); /* NOTE(review): counts one tick per call even when user + system > 1 - confirm intended */ +} + +/* + * Called from the timer interrupt handler to charge one tick to the current + * process. user_tick is 1 if the tick is user time, 0 for system. + */ +void update_process_times(int user_tick) +{ + struct task_struct *p = current; + int cpu = smp_processor_id(), system = user_tick ^ 1; + + update_one_process(p, user_tick, system, cpu); + if (p->pid) { + if (--p->counter <= 0) { + p->counter = 0; + /* + * SCHED_FIFO is priority preemption, so this is + * not the place to decide whether to reschedule a + * SCHED_FIFO task or not - Bhavesh Davda + */ + if (p->policy != SCHED_FIFO) { + p->need_resched = 1; + } + } + if (p->nice > 0) + kstat.per_cpu_nice[cpu] += user_tick; + else + kstat.per_cpu_user[cpu] += user_tick; + kstat.per_cpu_system[cpu] += system; + } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1) + kstat.per_cpu_system[cpu] += system; +} + +/* + * Called from the timer interrupt handler to charge a couple of ticks + * to the current process. 
+ */ +void update_process_times_us(int user_ticks, int system_ticks) +{ + struct task_struct *p = current; + int cpu = smp_processor_id(); + + update_one_process(p, user_ticks, system_ticks, cpu); + if (p->pid) { + p->counter -= user_ticks + system_ticks; /* charge the whole batch of ticks against the time slice */ + if (p->counter <= 0) { + p->counter = 0; + if (p->policy != SCHED_FIFO) p->need_resched = 1; /* same rule as update_process_times(): an expired slice does not force a SCHED_FIFO task to reschedule */ + } + if (p->nice > 0) + kstat.per_cpu_nice[cpu] += user_ticks; + else + kstat.per_cpu_user[cpu] += user_ticks; + kstat.per_cpu_system[cpu] += system_ticks; + } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1) + kstat.per_cpu_system[cpu] += system_ticks; +} + +/* + * Nr of active tasks - counted in fixed-point numbers + */ +static unsigned long count_active_tasks(void) +{ + struct task_struct *p; + unsigned long nr = 0; + + read_lock(&tasklist_lock); + for_each_task(p) { + if ((p->state == TASK_RUNNING || + (p->state & TASK_UNINTERRUPTIBLE))) + nr += FIXED_1; + } + read_unlock(&tasklist_lock); + return nr; +} + +/* + * Hmm.. Changed this, as the GNU make sources (load.c) seem to + * imply that avenrun[] is the standard name for this kind of thing. + * Nothing else seems to be standardized: the fractional size etc + * all seem to differ on different machines. + */ +unsigned long avenrun[3]; + +static inline void calc_load(unsigned long ticks) +{ + unsigned long active_tasks; /* fixed-point */ + static int count = LOAD_FREQ; + + count -= ticks; + while (count < 0) { /* loops because several LOAD_FREQ periods may have elapsed in one call */ + count += LOAD_FREQ; + active_tasks = count_active_tasks(); + CALC_LOAD(avenrun[0], EXP_1, active_tasks); + CALC_LOAD(avenrun[1], EXP_5, active_tasks); + CALC_LOAD(avenrun[2], EXP_15, active_tasks); + } +} + +/* jiffies at the most recent update of wall time */ +unsigned long wall_jiffies; + +/* + * This spinlock protects us from races in SMP while playing with xtime. 
-arca
+ */
+rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
+
+/*
+ * Bring the wall clock up to date with jiffies and feed the elapsed
+ * ticks to the load-average calculation.
+ */
+static inline void update_times(void)
+{
+	unsigned long elapsed;
+
+	/*
+	 * We are called from the raw timer_bh handler, where interrupts
+	 * are known to be enabled on this CPU, so the plain _irq lock
+	 * variants suffice and the local flags need no save/restore. -arca
+	 */
+	write_lock_irq(&xtime_lock);
+	vxtime_lock();
+
+	elapsed = jiffies - wall_jiffies;
+	if (elapsed != 0) {
+		wall_jiffies += elapsed;
+		update_wall_time(elapsed);
+	}
+
+	vxtime_unlock();
+	write_unlock_irq(&xtime_lock);
+
+	/* The load average is updated outside the xtime locks. */
+	calc_load(elapsed);
+}
+
+/* Timer bottom half: update the clocks, then run the timer list. */
+void timer_bh(void)
+{
+	update_times();
+	run_timer_list();
+}
+
+/*
+ * Mark the timer bottom half for execution and, if anything is queued
+ * on tq_timer, the task-queue bottom half as well.
+ */
+static inline void mark_timer_bhs(void)
+{
+	mark_bh(TIMER_BH);
+	if (TQ_ACTIVE(tq_timer))
+		mark_bh(TQUEUE_BH);
+}
+
+/*
+ * Advance jiffies by one tick and schedule the timer bottom halves.
+ * On non-SMP builds the tick is also charged to the current process.
+ */
+void do_timer(struct pt_regs *regs)
+{
+	*(unsigned long *)&jiffies += 1;
+#ifndef CONFIG_SMP
+	/* SMP process accounting uses the local APIC timer */
+
+	update_process_times(user_mode(regs));
+#endif
+	mark_timer_bhs();
+}
+
+/*
+ * Advance jiffies by 'ticks' ticks at once and schedule the timer
+ * bottom halves.  No process accounting is done here.
+ */
+void do_timer_ticks(int ticks)
+{
+	*(unsigned long *)&jiffies += ticks;
+	mark_timer_bhs();
+}
+
+#if !(defined(__alpha__) || defined(__ia64__))
+
+/*
+ * For backwards compatibility?  This can be done in libc so Alpha
+ * and all newer ports shouldn't need it.
+ *
+ * Installs a one-shot ITIMER_REAL of 'seconds' seconds and returns the
+ * time left on the previous one, in seconds, rounded up.
+ */
+asmlinkage unsigned long sys_alarm(unsigned int seconds)
+{
+	struct itimerval it_new, it_old;
+	unsigned int oldalarm;
+
+	/* One-shot timer: no reload interval. */
+	it_new.it_interval.tv_usec = 0;
+	it_new.it_interval.tv_sec = 0;
+	it_new.it_value.tv_usec = 0;
+	it_new.it_value.tv_sec = seconds;
+	do_setitimer(ITIMER_REAL, &it_new, &it_old);
+
+	/*
+	 * We can't return 0 while an alarm was still pending, and
+	 * reporting too much is better than too little, so a leftover
+	 * fraction of a second counts as a whole one.
+	 */
+	oldalarm = it_old.it_value.tv_sec;
+	if (it_old.it_value.tv_usec != 0)
+		oldalarm += 1;
+	return oldalarm;
+}
+
+#endif
+
+#ifndef __alpha__
+
+/*
+ * The Alpha uses getxpid, getxuid, and getxgid instead.  Maybe this
+ * should be moved into arch/i386 instead?
+ */
+
+/**
+ * sys_getpid - return the thread group id of the current process
+ *
+ * The value handed back is current->tgid rather than current->pid.  The
+ * two are identical unless CLONE_THREAD was specified on clone(), in
+ * which case every thread of the group reports the same tgid.
+ *
+ * This is SMP safe as current->tgid does not change.
+ */
+asmlinkage long sys_getpid(void)
+{
+	long tgid = current->tgid;
+
+	return tgid;
+}
+
+/*
+ * This is not strictly SMP safe: p_opptr could change
+ * from under us. However, rather than getting any lock
+ * we can use an optimistic algorithm: get the parent
+ * pid, and go back and check that the parent is still
+ * the same. If it has changed (which is extremely unlikely
+ * indeed), we just try again..
+ *
+ * NOTE! This depends on the fact that even if we _do_
+ * get an old value of "parent", we can happily dereference
+ * the pointer: we just can't necessarily trust the result
+ * until we know that the parent pointer is valid.
+ *
+ * The "mb()" macro is a memory barrier - a synchronizing
+ * event. It also makes sure that gcc doesn't optimize
+ * away the necessary memory references.. The barrier doesn't
+ * have to have all that strong semantics: on x86 we don't
+ * really require a synchronizing instruction, for example.
+ * The barrier is more important for code generation than
+ * for any real memory ordering semantics (even if there is
+ * a small window for a race, using the old pointer is
+ * harmless for a while).
+ */
+asmlinkage long sys_getppid(void)
+{
+	int pid;
+	struct task_struct * me = current;
+	struct task_struct * parent;
+
+	parent = me->p_opptr;
+	for (;;) {
+		pid = parent->pid;
+		/*
+		 * CONFIG_SMP is either defined or absent, so test it with
+		 * #ifdef: a plain #if on an undefined symbol only works by
+		 * accident and warns under -Wundef.
+		 */
+#ifdef CONFIG_SMP
+{
+		struct task_struct *old = parent;
+		mb();
+		/* Re-read the parent; retry if it changed meanwhile. */
+		parent = me->p_opptr;
+		if (old != parent)
+			continue;
+}
+#endif
+		break;
+	}
+	return pid;
+}
+
+/* Return the uid of the current task. */
+asmlinkage long sys_getuid(void)
+{
+	/* Only we change this so SMP safe */
+	return current->uid;
+}
+
+/* Return the effective uid of the current task. */
+asmlinkage long sys_geteuid(void)
+{
+	/* Only we change this so SMP safe */
+	return current->euid;
+}
+
+/* Return the gid of the current task. */
+asmlinkage long sys_getgid(void)
+{
+	/* Only we change this so SMP safe */
+	return current->gid;
+}
+
+/* Return the effective gid of the current task. */
+asmlinkage long sys_getegid(void)
+{
+	/* Only we change this so SMP safe */
+	return current->egid;
+}
+
+#endif
+
+/* Thread ID - the internal kernel "pid" */
+asmlinkage long sys_gettid(void)
+{
+	return current->pid;
+}
+
+/*
+ * Sleep for the interval given in the user-space timespec 'rqtp'.
+ *
+ * Returns 0 once the whole interval has passed, -EFAULT if a user
+ * pointer cannot be accessed, -EINVAL if the requested time is negative
+ * or tv_nsec is not below one second, and -EINTR if schedule_timeout()
+ * reports time left over; in that case the remainder is copied to
+ * 'rmtp' when 'rmtp' is non-NULL.
+ */
+asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
+{
+	struct timespec t;
+	unsigned long expire;
+
+	if (copy_from_user(&t, rqtp, sizeof(struct timespec)))
+		return -EFAULT;
+
+	if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
+		return -EINVAL;
+
+
+	if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
+	    current->policy != SCHED_OTHER)
+	{
+		/*
+		 * Short delay requests up to 2 ms will be handled with
+		 * high precision by a busy wait for all real-time processes.
+		 *
+		 * Its important on SMP not to do this holding locks.
+		 */
+		udelay((t.tv_nsec + 999) / 1000);	/* ns -> us, rounded up */
+		return 0;
+	}
+
+	/* Any non-zero request gets one extra jiffy on top of the conversion. */
+	expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
+
+	current->state = TASK_INTERRUPTIBLE;
+	expire = schedule_timeout(expire);
+
+	if (expire) {
+		/* Time is left over: report it and flag the interruption. */
+		if (rmtp) {
+			jiffies_to_timespec(expire, &t);
+			if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
+				return -EFAULT;
+		}
+		return -EINTR;
+	}
+	return 0;
+}
+
-- 
2.30.2